From c19c03fc749147f565e807fa65f1729066800571 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 4 Jun 2007 15:15:35 +1000
Subject: [POWERPC] unmap_vm_area becomes unmap_kernel_range for the public

This makes unmap_vm_area static and a wrapper around a new
exported unmap_kernel_range that takes an explicit range instead
of a vm_area struct.

This makes it more versatile for code that wants to play with kernel
page tables outside of the standard vmalloc area.

(One example is some rework of the PowerPC PCI IO space mapping
code that depends on that patch and removes some code duplication
and horrible abuse of forged struct vm_struct).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/vmalloc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 4b7ee83787c1..132b260aef1e 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -65,9 +65,10 @@ extern struct vm_struct *get_vm_area_node(unsigned long size,
 					  unsigned long flags, int node,
 					  gfp_t gfp_mask);
 extern struct vm_struct *remove_vm_area(void *addr);
+
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
 			struct page ***pages);
-extern void unmap_vm_area(struct vm_struct *area);
+extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 
 /*
  *	Internals.  Dont't use..
-- 
cgit v1.3-8-gc7d7


From d7ad2254fa7cc11aec3faeba076c1243f6adeb47 Mon Sep 17 00:00:00 2001
From: John Keller <jpk@sgi.com>
Date: Mon, 9 Jul 2007 11:42:24 -0700
Subject: [IA64] SN: Correct ROM resource length for BIOS copy

On SN systems, when setting the IORESOURCE_ROM_BIOS_COPY resource flag,
the resource length should be set to the actual size of the ROM image
so that a call to pci_map_rom() returns the correct size.

Signed-off-by: John Keller <jpk@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/io_acpi_init.c | 17 +++++----
 arch/ia64/sn/kernel/io_init.c      | 20 +++++++++--
 drivers/pci/rom.c                  | 73 +++++++++++++++++++++++---------------
 include/linux/pci.h                |  1 +
 4 files changed, 70 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c
index c6216f454ffb..3c7178f5dce8 100644
--- a/arch/ia64/sn/kernel/io_acpi_init.c
+++ b/arch/ia64/sn/kernel/io_acpi_init.c
@@ -418,7 +418,7 @@ sn_acpi_slot_fixup(struct pci_dev *dev)
 	void __iomem *addr;
 	struct pcidev_info *pcidev_info = NULL;
 	struct sn_irq_info *sn_irq_info = NULL;
-	size_t size;
+	size_t image_size, size;
 
 	if (sn_acpi_get_pcidev_info(dev, &pcidev_info, &sn_irq_info)) {
 		panic("%s:  Failure obtaining pcidev_info for %s\n",
@@ -428,17 +428,16 @@ sn_acpi_slot_fixup(struct pci_dev *dev)
 	if (pcidev_info->pdi_pio_mapped_addr[PCI_ROM_RESOURCE]) {
 		/*
 		 * A valid ROM image exists and has been shadowed by the
-		 * PROM. Setup the pci_dev ROM resource to point to
-		 * the shadowed copy.
+		 * PROM. Setup the pci_dev ROM resource with the address
+		 * of the shadowed copy, and the actual length of the ROM image.
 		 */
-		size = dev->resource[PCI_ROM_RESOURCE].end -
-				dev->resource[PCI_ROM_RESOURCE].start;
-		addr =
-		     ioremap(pcidev_info->pdi_pio_mapped_addr[PCI_ROM_RESOURCE],
-			     size);
+		size = pci_resource_len(dev, PCI_ROM_RESOURCE);
+		addr = ioremap(pcidev_info->pdi_pio_mapped_addr[PCI_ROM_RESOURCE],
+			       size);
+		image_size = pci_get_rom_size(addr, size);
 		dev->resource[PCI_ROM_RESOURCE].start = (unsigned long) addr;
 		dev->resource[PCI_ROM_RESOURCE].end =
-						(unsigned long) addr + size;
+					(unsigned long) addr + image_size - 1;
 		dev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_BIOS_COPY;
 	}
 	sn_pci_fixup_slot(dev, pcidev_info, sn_irq_info);
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index 6b10e5d28488..906b93674b76 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -259,9 +259,23 @@ sn_io_slot_fixup(struct pci_dev *dev)
 			insert_resource(&ioport_resource, &dev->resource[idx]);
 		else
 			insert_resource(&iomem_resource, &dev->resource[idx]);
-		/* If ROM, mark as shadowed in PROM */
-		if (idx == PCI_ROM_RESOURCE)
-			dev->resource[idx].flags |= IORESOURCE_ROM_BIOS_COPY;
+		/*
+		 * If ROM, set the actual ROM image size, and mark as
+		 * shadowed in PROM.
+		 */
+		if (idx == PCI_ROM_RESOURCE) {
+			size_t image_size;
+			void __iomem *rom;
+
+			rom = ioremap(pci_resource_start(dev, PCI_ROM_RESOURCE),
+				      size + 1);
+			image_size = pci_get_rom_size(rom, size + 1);
+			dev->resource[PCI_ROM_RESOURCE].end =
+				dev->resource[PCI_ROM_RESOURCE].start +
+				image_size - 1;
+			dev->resource[PCI_ROM_RESOURCE].flags |=
+						 IORESOURCE_ROM_BIOS_COPY;
+		}
 	}
 	/* Create a pci_window in the pci_controller struct for
 	 * each device resource.
diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c
index d087e0817715..dbbcc04abd1a 100644
--- a/drivers/pci/rom.c
+++ b/drivers/pci/rom.c
@@ -53,6 +53,49 @@ static void pci_disable_rom(struct pci_dev *pdev)
 	pci_write_config_dword(pdev, pdev->rom_base_reg, rom_addr);
 }
 
+/**
+ * pci_get_rom_size - obtain the actual size of the ROM image
+ * @rom: kernel virtual pointer to image of ROM
+ * @size: size of PCI window
+ *  return: size of actual ROM image
+ *
+ * Determine the actual length of the ROM image.
+ * The PCI window size could be much larger than the
+ * actual image size.
+ */
+size_t pci_get_rom_size(void __iomem *rom, size_t size)
+{
+	void __iomem *image;
+	int last_image;
+
+	image = rom;
+	do {
+		void __iomem *pds;
+		/* Standard PCI ROMs start out with these bytes 55 AA */
+		if (readb(image) != 0x55)
+			break;
+		if (readb(image + 1) != 0xAA)
+			break;
+		/* get the PCI data structure and check its signature */
+		pds = image + readw(image + 24);
+		if (readb(pds) != 'P')
+			break;
+		if (readb(pds + 1) != 'C')
+			break;
+		if (readb(pds + 2) != 'I')
+			break;
+		if (readb(pds + 3) != 'R')
+			break;
+		last_image = readb(pds + 21) & 0x80;
+		/* this length is reliable */
+		image += readw(pds + 16) * 512;
+	} while (!last_image);
+
+	/* never return a size larger than the PCI resource window */
+	/* there are known ROMs that get the size wrong */
+	return min((size_t)(image - rom), size);
+}
+
 /**
  * pci_map_rom - map a PCI ROM to kernel space
  * @pdev: pointer to pci device struct
@@ -68,8 +111,6 @@ void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size)
 	struct resource *res = &pdev->resource[PCI_ROM_RESOURCE];
 	loff_t start;
 	void __iomem *rom;
-	void __iomem *image;
-	int last_image;
 
 	/*
 	 * IORESOURCE_ROM_SHADOW set on x86, x86_64 and IA64 supports legacy
@@ -117,33 +158,7 @@ void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size)
 	 * size is much larger than the actual size of the ROM.
 	 * True size is important if the ROM is going to be copied.
 	 */
-	image = rom;
-	do {
-		void __iomem *pds;
-		/* Standard PCI ROMs start out with these bytes 55 AA */
-		if (readb(image) != 0x55)
-			break;
-		if (readb(image + 1) != 0xAA)
-			break;
-		/* get the PCI data structure and check its signature */
-		pds = image + readw(image + 24);
-		if (readb(pds) != 'P')
-			break;
-		if (readb(pds + 1) != 'C')
-			break;
-		if (readb(pds + 2) != 'I')
-			break;
-		if (readb(pds + 3) != 'R')
-			break;
-		last_image = readb(pds + 21) & 0x80;
-		/* this length is reliable */
-		image += readw(pds + 16) * 512;
-	} while (!last_image);
-
-	/* never return a size larger than the PCI resource window */
-	/* there are known ROMs that get the size wrong */
-	*size = min((size_t)(image - rom), *size);
-
+	*size = pci_get_rom_size(rom, *size);
 	return rom;
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 086a0e5a6318..acb9387c0364 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -560,6 +560,7 @@ void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
 void __iomem __must_check *pci_map_rom_copy(struct pci_dev *pdev, size_t *size);
 void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
 void pci_remove_rom(struct pci_dev *pdev);
+size_t pci_get_rom_size(void __iomem *rom, size_t size);
 
 /* Power management related routines */
 int pci_save_state(struct pci_dev *dev);
-- 
cgit v1.3-8-gc7d7


From 149983af609e8f5c57157467baf8545d17b8a6a1 Mon Sep 17 00:00:00 2001
From: Dotan Barak <dotanb@dev.mellanox.co.il>
Date: Tue, 26 Jun 2007 15:55:28 +0300
Subject: mlx4_core: Get the maximum message size from reported device
 capabilities

Get the maximum message size from the device capabilities returned
from the QUERY_DEV_CAP firmware command, rather than hard-coding 2 GB.

Signed-off-by: Dotan Barak <dotanb@dev.mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/main.c | 2 +-
 drivers/net/mlx4/fw.c             | 3 +++
 drivers/net/mlx4/fw.h             | 1 +
 drivers/net/mlx4/main.c           | 1 +
 include/linux/mlx4/device.h       | 1 +
 5 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index c591616dccde..2fc8ccebaac1 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -169,7 +169,7 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
 	props->phys_state	= out_mad->data[33] >> 4;
 	props->port_cap_flags	= be32_to_cpup((__be32 *) (out_mad->data + 20));
 	props->gid_tbl_len	= to_mdev(ibdev)->dev->caps.gid_table_len[port];
-	props->max_msg_sz	= 0x80000000;
+	props->max_msg_sz	= to_mdev(ibdev)->dev->caps.max_msg_sz;
 	props->pkey_tbl_len	= to_mdev(ibdev)->dev->caps.pkey_table_len[port];
 	props->bad_pkey_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 46));
 	props->qkey_viol_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 48));
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index d2b065351e45..c45cbe43a0c4 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -138,6 +138,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET		0x35
 #define QUERY_DEV_CAP_MTU_WIDTH_OFFSET		0x36
 #define QUERY_DEV_CAP_VL_PORT_OFFSET		0x37
+#define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET		0x38
 #define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
 #define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
 #define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
@@ -220,6 +221,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev_cap->local_ca_ack_delay = field & 0x1f;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
 	dev_cap->num_ports = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET);
+	dev_cap->max_msg_sz = 1 << (field & 0x1f);
 	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
 	dev_cap->stat_rate_support = stat_rate;
 	MLX4_GET(dev_cap->flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index 296254ac27c1..7e1dd9e25cfb 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -60,6 +60,7 @@ struct mlx4_dev_cap {
 	int max_rdma_global;
 	int local_ca_ack_delay;
 	int num_ports;
+	u32 max_msg_sz;
 	int max_mtu[MLX4_MAX_PORTS + 1];
 	int max_port_width[MLX4_MAX_PORTS + 1];
 	int max_vl[MLX4_MAX_PORTS + 1];
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index c3da2a2f5431..a4f2e0475a71 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -154,6 +154,7 @@ static int __devinit mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev
 	dev->caps.reserved_uars	     = dev_cap->reserved_uars;
 	dev->caps.reserved_pds	     = dev_cap->reserved_pds;
 	dev->caps.mtt_entry_sz	     = MLX4_MTT_ENTRY_PER_SEG * dev_cap->mtt_entry_sz;
+	dev->caps.max_msg_sz         = dev_cap->max_msg_sz;
 	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
 	dev->caps.flags		     = dev_cap->flags;
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b372f5910fc1..8209387ee854 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -172,6 +172,7 @@ struct mlx4_caps {
 	int			num_pds;
 	int			reserved_pds;
 	int			mtt_entry_sz;
+	u32			max_msg_sz;
 	u32			page_size_cap;
 	u32			flags;
 	u16			stat_rate_support;
-- 
cgit v1.3-8-gc7d7


From 80128ff79d282cf71b1819dbca9b8dd47d8ed3e8 Mon Sep 17 00:00:00 2001
From: Vitaly Bordug <vitb@kernel.crashing.org>
Date: Mon, 9 Jul 2007 11:37:35 -0700
Subject: [POWERPC] 8xx: mpc885ads pcmcia support

Adds support for PowerQuicc on-chip PCMCIA.  The driver is implemented as
of_device, so only arch/powerpc stuff is capable to use it, which now implies
only mpc885ads reference board.

To cope with the code that should be hooked inside driver, but is really board
specific (like set_voltage), global structure mpc8xx_pcmcia_ops holds
necessary function pointers that are filled in the BSP code.

[akpm@linux-foundation.org: whitespace diddles]
Signed-off-by: Vitaly Bordug <vitb@kernel.crashing.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Olof Johansson <olof@lixom.net>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/boot/dts/mpc885ads.dts          |  11 +
 arch/powerpc/platforms/8xx/m8xx_setup.c      |   5 +
 arch/powerpc/platforms/8xx/mpc885ads_setup.c |  71 ++++++
 arch/powerpc/sysdev/fsl_soc.c                |  13 +
 arch/powerpc/sysdev/mpc8xx_pic.h             |  11 +-
 drivers/pcmcia/Kconfig                       |  17 +-
 drivers/pcmcia/m8xx_pcmcia.c                 | 351 +++++++++++++--------------
 include/asm-powerpc/mpc8xx.h                 |   4 +
 include/linux/fsl_devices.h                  |   5 +
 9 files changed, 294 insertions(+), 194 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/boot/dts/mpc885ads.dts b/arch/powerpc/boot/dts/mpc885ads.dts
index 110bf6170603..aee01087a932 100644
--- a/arch/powerpc/boot/dts/mpc885ads.dts
+++ b/arch/powerpc/boot/dts/mpc885ads.dts
@@ -112,6 +112,17 @@
 			compatible = "CPM";
 		};
 
+		pcmcia@0080 {
+			#address-cells = <3>;
+			#interrupt-cells = <1>;
+			#size-cells = <2>;
+			compatible = "fsl,pq-pcmcia";
+			device_type = "pcmcia";
+			reg = <80 80>;
+			interrupt-parent = <ff000000>;
+			interrupts = <d 1>;
+		};
+
 		cpm@ff000000 {
 			linux,phandle = <ff000000>;
 			#address-cells = <1>;
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c
index 0901dbada350..f1693550c70c 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -32,6 +32,7 @@
 #include <linux/root_dev.h>
 #include <linux/time.h>
 #include <linux/rtc.h>
+#include <linux/fsl_devices.h>
 
 #include <asm/mmu.h>
 #include <asm/reg.h>
@@ -49,6 +50,10 @@
 
 #include "sysdev/mpc8xx_pic.h"
 
+#ifdef CONFIG_PCMCIA_M8XX
+struct mpc8xx_pcmcia_ops m8xx_pcmcia_ops;
+#endif
+
 void m8xx_calibrate_decr(void);
 extern void m8xx_wdt_handler_install(bd_t *bp);
 extern int cpm_pic_init(void);
diff --git a/arch/powerpc/platforms/8xx/mpc885ads_setup.c b/arch/powerpc/platforms/8xx/mpc885ads_setup.c
index c36e475d93dc..dc27dab48df0 100644
--- a/arch/powerpc/platforms/8xx/mpc885ads_setup.c
+++ b/arch/powerpc/platforms/8xx/mpc885ads_setup.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs_enet_pd.h>
 #include <linux/fs_uart_pd.h>
+#include <linux/fsl_devices.h>
 #include <linux/mii.h>
 
 #include <asm/delay.h>
@@ -51,6 +52,70 @@ static void init_smc1_uart_ioports(struct fs_uart_platform_info* fpi);
 static void init_smc2_uart_ioports(struct fs_uart_platform_info* fpi);
 static void init_scc3_ioports(struct fs_platform_info* ptr);
 
+#ifdef CONFIG_PCMCIA_M8XX
+static void pcmcia_hw_setup(int slot, int enable)
+{
+	unsigned *bcsr_io;
+
+	bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
+	if (enable)
+		clrbits32(bcsr_io, BCSR1_PCCEN);
+	else
+		setbits32(bcsr_io, BCSR1_PCCEN);
+
+	iounmap(bcsr_io);
+}
+
+static int pcmcia_set_voltage(int slot, int vcc, int vpp)
+{
+	u32 reg = 0;
+	unsigned *bcsr_io;
+
+	bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
+
+	switch(vcc) {
+	case 0:
+		break;
+	case 33:
+		reg |= BCSR1_PCCVCC0;
+		break;
+	case 50:
+		reg |= BCSR1_PCCVCC1;
+		break;
+	default:
+		return 1;
+	}
+
+	switch(vpp) {
+	case 0:
+		break;
+	case 33:
+	case 50:
+		if(vcc == vpp)
+			reg |= BCSR1_PCCVPP1;
+		else
+			return 1;
+		break;
+	case 120:
+		if ((vcc == 33) || (vcc == 50))
+			reg |= BCSR1_PCCVPP0;
+		else
+			return 1;
+	default:
+		return 1;
+	}
+
+	/* first, turn off all power */
+	clrbits32(bcsr_io, 0x00610000);
+
+	/* enable new powersettings */
+	setbits32(bcsr_io, reg);
+
+	iounmap(bcsr_io);
+	return 0;
+}
+#endif
+
 void __init mpc885ads_board_setup(void)
 {
 	cpm8xx_t *cp;
@@ -115,6 +180,12 @@ void __init mpc885ads_board_setup(void)
 	immr_unmap(io_port);
 
 #endif
+
+#ifdef CONFIG_PCMCIA_M8XX
+	/*Set up board specific hook-ups*/
+	m8xx_pcmcia_ops.hw_ctrl = pcmcia_hw_setup;
+	m8xx_pcmcia_ops.voltage_set = pcmcia_set_voltage;
+#endif
 }
 
 
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index cad175724359..c0ddc80d8160 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -1028,6 +1028,19 @@ err:
 
 arch_initcall(fs_enet_of_init);
 
+static int __init fsl_pcmcia_of_init(void)
+{
+	struct device_node *np = NULL;
+	/*
+	 * Register all the devices which type is "pcmcia"
+	 */
+	while ((np = of_find_compatible_node(np,
+			"pcmcia", "fsl,pq-pcmcia")) != NULL)
+			    of_platform_device_create(np, "m8xx-pcmcia", NULL);
+	return 0;
+}
+
+arch_initcall(fsl_pcmcia_of_init);
 
 static const char *smc_regs = "regs";
 static const char *smc_pram = "pram";
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.h b/arch/powerpc/sysdev/mpc8xx_pic.h
index afa2ee6717c1..9fe00eebdc8b 100644
--- a/arch/powerpc/sysdev/mpc8xx_pic.h
+++ b/arch/powerpc/sysdev/mpc8xx_pic.h
@@ -4,9 +4,16 @@
 #include <linux/irq.h>
 #include <linux/interrupt.h>
 
-extern struct hw_interrupt_type mpc8xx_pic;
-
 int mpc8xx_pic_init(void);
 unsigned int mpc8xx_get_irq(void);
 
+/*
+ * Some internal interrupt registers use an 8-bit mask for the interrupt
+ * level instead of a number.
+ */
+static inline uint mk_int_int_mask(uint mask)
+{
+	return (1 << (7 - (mask/2)));
+}
+
 #endif /* _PPC_KERNEL_PPC8xx_H */
diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig
index 35f88649d3b7..c0c77f82d051 100644
--- a/drivers/pcmcia/Kconfig
+++ b/drivers/pcmcia/Kconfig
@@ -180,14 +180,15 @@ config TCIC
 	  PCMCIA cards are plugged into. If unsure, say N.
 
 config PCMCIA_M8XX
-        tristate "MPC8xx PCMCIA support"
-        depends on PCMCIA && PPC && 8xx 
-        select PCCARD_IODYN
-        help
-        Say Y here to include support for PowerPC 8xx series PCMCIA
-        controller.
-
-        This driver is also available as a module called m8xx_pcmcia.
+	tristate "MPC8xx PCMCIA support"
+	depends on PCMCIA && PPC && 8xx
+	select PCCARD_IODYN
+	select PCCARD_NONSTATIC
+	help
+	  Say Y here to include support for PowerPC 8xx series PCMCIA
+	  controller.
+
+	  This driver is also available as a module called m8xx_pcmcia.
 
 config HD64465_PCMCIA
 	tristate "HD64465 host bridge support"
diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c
index 9721ed7bf502..3b40f9623cc9 100644
--- a/drivers/pcmcia/m8xx_pcmcia.c
+++ b/drivers/pcmcia/m8xx_pcmcia.c
@@ -10,7 +10,7 @@
  * Further fixes, v2.6 kernel port
  *     <marcelo.tosatti@cyclades.com>
  * 
- * Some fixes, additions (C) 2005 Montavista Software, Inc. 
+ * Some fixes, additions (C) 2005-2007 Montavista Software, Inc.
  *     <vbordug@ru.mvista.com>
  *
  * "The ExCA standard specifies that socket controllers should provide
@@ -40,10 +40,6 @@
 #include <linux/fcntl.h>
 #include <linux/string.h>
 
-#include <asm/io.h>
-#include <asm/bitops.h>
-#include <asm/system.h>
-
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
@@ -51,11 +47,18 @@
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
-#include <linux/platform_device.h>
+#include <linux/fsl_devices.h>
 
+#include <asm/io.h>
+#include <asm/bitops.h>
+#include <asm/system.h>
+#include <asm/time.h>
 #include <asm/mpc8xx.h>
 #include <asm/8xx_immap.h>
 #include <asm/irq.h>
+#include <asm/fs_pd.h>
+#include <asm/of_device.h>
+#include <asm/of_platform.h>
 
 #include <pcmcia/version.h>
 #include <pcmcia/cs_types.h>
@@ -146,27 +149,17 @@ MODULE_LICENSE("Dual MPL/GPL");
 #define PCMCIA_MEM_WIN_BASE 0xe0000000 /* base address for memory window 0   */
 #define PCMCIA_MEM_WIN_SIZE 0x04000000 /* each memory window is 64 MByte     */
 #define PCMCIA_IO_WIN_BASE  _IO_BASE   /* base address for io window 0       */
-
-#define PCMCIA_SCHLVL PCMCIA_INTERRUPT /* Status Change Interrupt Level      */
-
 /* ------------------------------------------------------------------------- */
 
-/* 2.4.x and newer has this always in HZ */
-#define M8XX_BUSFREQ ((((bd_t *)&(__res))->bi_busfreq))
-
-static int pcmcia_schlvl = PCMCIA_SCHLVL;
+static int pcmcia_schlvl;
 
 static DEFINE_SPINLOCK(events_lock);
 
-
 #define PCMCIA_SOCKET_KEY_5V 1
 #define PCMCIA_SOCKET_KEY_LV 2
 
 /* look up table for pgcrx registers */
-static u32 *m8xx_pgcrx[2] = {
-	&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pgcra,
-	&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pgcrb
-};
+static u32 *m8xx_pgcrx[2];
 
 /*
  * This structure is used to address each window in the PCMCIA controller.
@@ -228,11 +221,16 @@ struct event_table {
 	u32 eventbit;
 };
 
+static const char driver_name[] = "m8xx-pcmcia";
+
 struct socket_info {
 	void	(*handler)(void *info, u32 events);
 	void	*info;
 
 	u32 slot;
+	pcmconf8xx_t *pcmcia;
+	u32 bus_freq;
+	int hwirq;
 
 	socket_state_t state;
 	struct pccard_mem_map mem_win[PCMCIA_MEM_WIN_NO];
@@ -408,78 +406,21 @@ static void hardware_disable(int slot)
 #if defined(CONFIG_MPC885ADS)
 
 #define PCMCIA_BOARD_MSG "MPC885ADS"
+#define socket_get(_slot_) PCMCIA_SOCKET_KEY_5V
 
-static int voltage_set(int slot, int vcc, int vpp)
+static inline void hardware_enable(int slot)
 {
-	u32 reg = 0;
-	unsigned *bcsr_io;
-
-	bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
-
-	switch(vcc) {
-		case 0:
-			break;
-		case 33:
-			reg |= BCSR1_PCCVCC0;
-			break;
-		case 50:
-			reg |= BCSR1_PCCVCC1;
-			break;
-		default:
-			goto out_unmap;
-	}
-
-	switch(vpp) {
-		case 0:
-			break;
-		case 33:
-		case 50:
-			if(vcc == vpp)
-				reg |= BCSR1_PCCVPP1;
-			else
-				goto out_unmap;
-			break;
-		case 120:
-			if ((vcc == 33) || (vcc == 50))
-				reg |= BCSR1_PCCVPP0;
-			else
-				goto out_unmap;
-		default:
-			goto out_unmap;
-	}
-
-	/* first, turn off all power */
-	out_be32(bcsr_io, in_be32(bcsr_io) & ~(BCSR1_PCCVCC_MASK | BCSR1_PCCVPP_MASK));
-
-	/* enable new powersettings */
-	out_be32(bcsr_io, in_be32(bcsr_io) | reg);
-
-	iounmap(bcsr_io);
-	return 0;
-
-out_unmap:
-	iounmap(bcsr_io);
-	return 1;
+	 m8xx_pcmcia_ops.hw_ctrl(slot, 1);
 }
 
-#define socket_get(_slot_) PCMCIA_SOCKET_KEY_5V
-
-static void hardware_enable(int slot)
+static inline void hardware_disable(int slot)
 {
-	unsigned *bcsr_io;
-
-	bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
-	out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_PCCEN);
-	iounmap(bcsr_io);
+	m8xx_pcmcia_ops.hw_ctrl(slot, 0);
 }
 
-static void hardware_disable(int slot)
+static inline int voltage_set(int slot, int vcc, int vpp)
 {
-	unsigned *bcsr_io;
-
-	bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
-	out_be32(bcsr_io, in_be32(bcsr_io) |  BCSR1_PCCEN);
-	iounmap(bcsr_io);
+	return m8xx_pcmcia_ops.voltage_set(slot, vcc, vpp);
 }
 
 #endif
@@ -604,48 +545,6 @@ static int voltage_set(int slot, int vcc, int vpp)
 
 #endif /* CONFIG_PRxK */
 
-static void m8xx_shutdown(void)
-{
-	u32 m, i;
-	struct pcmcia_win *w;
-
-	for(i = 0; i < PCMCIA_SOCKETS_NO; i++){
-		w = (void *) &((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pbr0;
-
-		out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pscr, M8XX_PCMCIA_MASK(i));
-		out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per, in_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per) & ~M8XX_PCMCIA_MASK(i));
-
-		/* turn off interrupt and disable CxOE */
-		out_be32(M8XX_PGCRX(i), M8XX_PGCRX_CXOE);
-
-		/* turn off memory windows */
-		for(m = 0; m < PCMCIA_MEM_WIN_NO; m++) {
-			out_be32(&w->or, 0); /* set to not valid */
-			w++;
-		}
-
-		/* turn off voltage */
-		voltage_set(i, 0, 0);
-
-		/* disable external hardware */
-		hardware_disable(i);
-	}
-
-	free_irq(pcmcia_schlvl, NULL);
-}
-
-static struct device_driver m8xx_driver = {
-        .name = "m8xx-pcmcia",
-        .bus = &platform_bus_type,
-        .suspend = pcmcia_socket_dev_suspend,
-        .resume = pcmcia_socket_dev_resume,
-};
-
-static struct platform_device m8xx_device = {
-        .name = "m8xx-pcmcia",
-        .id = 0,
-};
-
 static u32 pending_events[PCMCIA_SOCKETS_NO];
 static DEFINE_SPINLOCK(pending_event_lock);
 
@@ -654,13 +553,14 @@ static irqreturn_t m8xx_interrupt(int irq, void *dev)
 	struct socket_info *s;
 	struct event_table *e;
 	unsigned int i, events, pscr, pipr, per;
+	pcmconf8xx_t	*pcmcia = socket[0].pcmcia;
 
 	dprintk("Interrupt!\n");
 	/* get interrupt sources */
 
-	pscr = in_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pscr);
-	pipr = in_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pipr);
-	per = in_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per);
+	pscr = in_be32(&pcmcia->pcmc_pscr);
+	pipr = in_be32(&pcmcia->pcmc_pipr);
+	per = in_be32(&pcmcia->pcmc_per);
 
 	for(i = 0; i < PCMCIA_SOCKETS_NO; i++) {
 		s = &socket[i];
@@ -724,7 +624,7 @@ static irqreturn_t m8xx_interrupt(int irq, void *dev)
 			per &= ~M8XX_PCMCIA_RDY_L(0);
 			per &= ~M8XX_PCMCIA_RDY_L(1);
 
-			out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per, per);
+			out_be32(&pcmcia->pcmc_per, per);
 
 			if (events)
 				pcmcia_parse_events(&socket[i].socket, events);
@@ -732,7 +632,7 @@ static irqreturn_t m8xx_interrupt(int irq, void *dev)
 	}
 
 	/* clear the interrupt sources */
-	out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pscr, pscr);
+	out_be32(&pcmcia->pcmc_pscr, pscr);
 
 	dprintk("Interrupt done.\n");
 
@@ -753,7 +653,7 @@ static u32 m8xx_get_graycode(u32 size)
 	return k;
 }
 
-static u32 m8xx_get_speed(u32 ns, u32 is_io)
+static u32 m8xx_get_speed(u32 ns, u32 is_io, u32 bus_freq)
 {
 	u32 reg, clocks, psst, psl, psht;
 
@@ -781,7 +681,7 @@ static u32 m8xx_get_speed(u32 ns, u32 is_io)
 
 #define ADJ 180 /* 80 % longer accesstime - to be sure */
 
-	clocks = ((M8XX_BUSFREQ / 1000) * ns) / 1000;
+	clocks = ((bus_freq / 1000) * ns) / 1000;
 	clocks = (clocks * ADJ) / (100*1000);
 	if(clocks >= PCMCIA_BMT_LIMIT) {
 		printk( "Max access time limit reached\n");
@@ -806,8 +706,9 @@ static int m8xx_get_status(struct pcmcia_socket *sock, unsigned int *value)
 	int lsock = container_of(sock, struct socket_info, socket)->slot;
 	struct socket_info *s = &socket[lsock];
 	unsigned int pipr, reg;
+	pcmconf8xx_t *pcmcia = s->pcmcia;
 
-	pipr = in_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pipr);
+	pipr = in_be32(&pcmcia->pcmc_pipr);
 
 	*value  = ((pipr & (M8XX_PCMCIA_CD1(lsock)
 			    | M8XX_PCMCIA_CD2(lsock))) == 0) ? SS_DETECT : 0;
@@ -918,6 +819,7 @@ static int m8xx_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
 	struct event_table *e;
 	unsigned int reg;
 	unsigned long flags;
+	pcmconf8xx_t *pcmcia = socket[0].pcmcia;
 
 	dprintk( "SetSocket(%d, flags %#3.3x, Vcc %d, Vpp %d, "
 	      "io_irq %d, csc_mask %#2.2x)\n", lsock, state->flags,
@@ -927,6 +829,7 @@ static int m8xx_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
 	if(voltage_set(lsock, state->Vcc, state->Vpp))
 		return -EINVAL;
 
+
 	/* Take care of reset... */
 	if(state->flags & SS_RESET)
 		out_be32(M8XX_PGCRX(lsock), in_be32(M8XX_PGCRX(lsock)) |  M8XX_PGCRX_CXRESET); /* active high */
@@ -982,7 +885,8 @@ static int m8xx_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
 		 * If io_irq is non-zero we should enable irq.
 		 */
 		if(state->io_irq) {
-			out_be32(M8XX_PGCRX(lsock), in_be32(M8XX_PGCRX(lsock)) | mk_int_int_mask(state->io_irq) << 24);
+			out_be32(M8XX_PGCRX(lsock),
+				 in_be32(M8XX_PGCRX(lsock)) | mk_int_int_mask(s->hwirq) << 24);
 			/*
 			 * Strange thing here:
 			 * The manual does not tell us which interrupt
@@ -1027,7 +931,7 @@ static int m8xx_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
 	 * Writing ones will clear the bits.
 	 */
 
-	out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pscr, reg);
+	out_be32(&pcmcia->pcmc_pscr, reg);
 
 	/*
 	 * Write the mask.
@@ -1036,15 +940,8 @@ static int m8xx_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
 	 * Ones will enable the interrupt.
 	 */
 
-	/*
-	  reg |= ((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per
-	  & M8XX_PCMCIA_MASK(lsock);
-	*/
-
-	reg |= in_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per) &
-		(M8XX_PCMCIA_MASK(0) | M8XX_PCMCIA_MASK(1));
-
-	out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per, reg);
+	reg |= in_be32(&pcmcia->pcmc_per) & (M8XX_PCMCIA_MASK(0) | M8XX_PCMCIA_MASK(1));
+	out_be32(&pcmcia->pcmc_per, reg);
 
 	spin_unlock_irqrestore(&events_lock, flags);
 
@@ -1062,6 +959,8 @@ static int m8xx_set_io_map(struct pcmcia_socket *sock, struct pccard_io_map *io)
 	struct socket_info *s = &socket[lsock];
 	struct pcmcia_win *w;
 	unsigned int reg, winnr;
+	pcmconf8xx_t *pcmcia = s->pcmcia;
+
 
 #define M8XX_SIZE (io->stop - io->start + 1)
 #define M8XX_BASE (PCMCIA_IO_WIN_BASE + io->start)
@@ -1086,7 +985,7 @@ static int m8xx_set_io_map(struct pcmcia_socket *sock, struct pccard_io_map *io)
 
 		/* setup registers */
 
-		w = (void *) &((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pbr0;
+		w = (void *) &pcmcia->pcmc_pbr0;
 		w += winnr;
 
 		out_be32(&w->or, 0); /* turn off window first */
@@ -1095,12 +994,13 @@ static int m8xx_set_io_map(struct pcmcia_socket *sock, struct pccard_io_map *io)
 		reg <<= 27;
   		reg |= M8XX_PCMCIA_POR_IO |(lsock << 2);
 
-		reg |= m8xx_get_speed(io->speed, 1);
+		reg |= m8xx_get_speed(io->speed, 1, s->bus_freq);
 
 		if(io->flags & MAP_WRPROT)
 			reg |= M8XX_PCMCIA_POR_WRPROT;
 
-		if(io->flags & (MAP_16BIT | MAP_AUTOSZ))
+		/*if(io->flags & (MAP_16BIT | MAP_AUTOSZ))*/
+		if(io->flags & MAP_16BIT)
 			reg |= M8XX_PCMCIA_POR_16BIT;
 
 		if(io->flags & MAP_ACTIVE)
@@ -1117,7 +1017,7 @@ static int m8xx_set_io_map(struct pcmcia_socket *sock, struct pccard_io_map *io)
 
 		/* setup registers */
 
-		w = (void *) &((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pbr0;
+		w = (void *) &pcmcia->pcmc_pbr0;
 		w += winnr;
 
 		out_be32(&w->or, 0); /* turn off window */
@@ -1144,6 +1044,7 @@ static int m8xx_set_mem_map(struct pcmcia_socket *sock, struct pccard_mem_map *m
 	struct pcmcia_win *w;
 	struct pccard_mem_map *old;
 	unsigned int reg, winnr;
+	pcmconf8xx_t *pcmcia = s->pcmcia;
 
 	dprintk( "SetMemMap(%d, %d, %#2.2x, %d ns, "
 	      "%#5.5lx, %#5.5x)\n", lsock, mem->map, mem->flags,
@@ -1166,12 +1067,12 @@ static int m8xx_set_mem_map(struct pcmcia_socket *sock, struct pccard_mem_map *m
 
 	/* Setup the window in the pcmcia controller */
 
-	w = (void *) &((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pbr0;
+	w = (void *) &pcmcia->pcmc_pbr0;
 	w += winnr;
 
 	reg |= lsock << 2;
 
-	reg |= m8xx_get_speed(mem->speed, 0);
+	reg |= m8xx_get_speed(mem->speed, 0, s->bus_freq);
 
 	if(mem->flags & MAP_ATTRIB)
 		reg |=  M8XX_PCMCIA_POR_ATTRMEM;
@@ -1236,60 +1137,69 @@ static int m8xx_sock_init(struct pcmcia_socket *sock)
 
 }
 
-static int m8xx_suspend(struct pcmcia_socket *sock)
+static int m8xx_sock_suspend(struct pcmcia_socket *sock)
 {
 	return m8xx_set_socket(sock, &dead_socket);
 }
 
 static struct pccard_operations m8xx_services = {
 	.init	= m8xx_sock_init,
-	.suspend = m8xx_suspend,
+	.suspend = m8xx_sock_suspend,
 	.get_status = m8xx_get_status,
 	.set_socket = m8xx_set_socket,
 	.set_io_map = m8xx_set_io_map,
 	.set_mem_map = m8xx_set_mem_map,
 };
 
-static int __init m8xx_init(void)
+static int __init m8xx_probe(struct of_device *ofdev, const struct of_device_id *match)
 {
 	struct pcmcia_win *w;
-	unsigned int i,m;
+	unsigned int i, m, hwirq;
+	pcmconf8xx_t *pcmcia;
+	int status;
+	struct device_node *np = ofdev->node;
 
 	pcmcia_info("%s\n", version);
 
-	if (driver_register(&m8xx_driver))
-		return -1;
+	pcmcia = of_iomap(np, 0);
+	if(pcmcia == NULL)
+		return -EINVAL;
+
+	pcmcia_schlvl = irq_of_parse_and_map(np, 0);
+	hwirq  = irq_map[pcmcia_schlvl].hwirq;
+	if (pcmcia_schlvl < 0)
+		return -EINVAL;
+
+	m8xx_pgcrx[0] = &pcmcia->pcmc_pgcra;
+	m8xx_pgcrx[1] = &pcmcia->pcmc_pgcrb;
+
 
 	pcmcia_info(PCMCIA_BOARD_MSG " using " PCMCIA_SLOT_MSG
-		    " with IRQ %u.\n", pcmcia_schlvl);
+		    " with IRQ %u  (%d). \n", pcmcia_schlvl, hwirq);
 
 	/* Configure Status change interrupt */
 
-	if(request_irq(pcmcia_schlvl, m8xx_interrupt, 0,
-			  "m8xx_pcmcia", NULL)) {
+	if(request_irq(pcmcia_schlvl, m8xx_interrupt, IRQF_SHARED,
+			  driver_name, socket)) {
 		pcmcia_error("Cannot allocate IRQ %u for SCHLVL!\n",
 			     pcmcia_schlvl);
 		return -1;
 	}
 
-	w = (void *) &((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pbr0;
-
-	out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_pscr,
-		M8XX_PCMCIA_MASK(0)| M8XX_PCMCIA_MASK(1));
+	w = (void *) &pcmcia->pcmc_pbr0;
 
-	out_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per,
-		in_be32(&((immap_t *)IMAP_ADDR)->im_pcmcia.pcmc_per) &
-		~(M8XX_PCMCIA_MASK(0)| M8XX_PCMCIA_MASK(1)));
+	out_be32(&pcmcia->pcmc_pscr, M8XX_PCMCIA_MASK(0)| M8XX_PCMCIA_MASK(1));
+	clrbits32(&pcmcia->pcmc_per, M8XX_PCMCIA_MASK(0) | M8XX_PCMCIA_MASK(1));
 
-/* connect interrupt and disable CxOE */
+	/* connect interrupt and disable CxOE */
 
-	out_be32(M8XX_PGCRX(0), M8XX_PGCRX_CXOE | (mk_int_int_mask(pcmcia_schlvl) << 16));
-	out_be32(M8XX_PGCRX(1), M8XX_PGCRX_CXOE | (mk_int_int_mask(pcmcia_schlvl) << 16));
+	out_be32(M8XX_PGCRX(0), M8XX_PGCRX_CXOE | (mk_int_int_mask(hwirq) << 16));
+	out_be32(M8XX_PGCRX(1), M8XX_PGCRX_CXOE | (mk_int_int_mask(hwirq) << 16));
 
-/* intialize the fixed memory windows */
+	/* intialize the fixed memory windows */
 
 	for(i = 0; i < PCMCIA_SOCKETS_NO; i++){
-		for(m = 0; m < PCMCIA_MEM_WIN_NO; m++) {
+		for (m = 0; m < PCMCIA_MEM_WIN_NO; m++) {
 			out_be32(&w->br, PCMCIA_MEM_WIN_BASE +
 				(PCMCIA_MEM_WIN_SIZE
 				 * (m + i * PCMCIA_MEM_WIN_NO)));
@@ -1300,16 +1210,14 @@ static int __init m8xx_init(void)
 		}
 	}
 
-/* turn off voltage */
+	/* turn off voltage */
 	voltage_set(0, 0, 0);
 	voltage_set(1, 0, 0);
 
-/* Enable external hardware */
+	/* Enable external hardware */
 	hardware_enable(0);
 	hardware_enable(1);
 
-	platform_device_register(&m8xx_device);
-
 	for (i = 0 ; i < PCMCIA_SOCKETS_NO; i++) {
 		socket[i].slot = i;
 		socket[i].socket.owner = THIS_MODULE;
@@ -1317,30 +1225,105 @@ static int __init m8xx_init(void)
 		socket[i].socket.irq_mask = 0x000;
 		socket[i].socket.map_size = 0x1000;
 		socket[i].socket.io_offset = 0;
-		socket[i].socket.pci_irq = i  ? 7 : 9;
+		socket[i].socket.pci_irq = pcmcia_schlvl;
 		socket[i].socket.ops = &m8xx_services;
-		socket[i].socket.resource_ops = &pccard_iodyn_ops;
+		socket[i].socket.resource_ops = &pccard_nonstatic_ops;
 		socket[i].socket.cb_dev = NULL;
-		socket[i].socket.dev.parent = &m8xx_device.dev;
+		socket[i].socket.dev.parent = &ofdev->dev;
+		socket[i].pcmcia = pcmcia;
+		socket[i].bus_freq = ppc_proc_freq;
+		socket[i].hwirq = hwirq;
+
+
 	}
 
-	for (i = 0; i < PCMCIA_SOCKETS_NO; i++)
-		pcmcia_register_socket(&socket[i].socket);
+	for (i = 0; i < PCMCIA_SOCKETS_NO; i++) {
+		status = pcmcia_register_socket(&socket[i].socket);
+		if (status < 0)
+			pcmcia_error("Socket register failed\n");
+	}
 
 	return 0;
 }
 
-static void __exit m8xx_exit(void)
+static int m8xx_remove(struct of_device* ofdev)
 {
-	int i;
+	u32 m, i;
+	struct pcmcia_win *w;
+	pcmconf8xx_t *pcmcia = socket[0].pcmcia;
+
+	for (i = 0; i < PCMCIA_SOCKETS_NO; i++) {
+		w = (void *) &pcmcia->pcmc_pbr0;
+
+		out_be32(&pcmcia->pcmc_pscr, M8XX_PCMCIA_MASK(i));
+		out_be32(&pcmcia->pcmc_per,
+			in_be32(&pcmcia->pcmc_per) & ~M8XX_PCMCIA_MASK(i));
 
+		/* turn off interrupt and disable CxOE */
+		out_be32(M8XX_PGCRX(i), M8XX_PGCRX_CXOE);
+
+		/* turn off memory windows */
+		for (m = 0; m < PCMCIA_MEM_WIN_NO; m++) {
+			out_be32(&w->or, 0); /* set to not valid */
+			w++;
+		}
+
+		/* turn off voltage */
+		voltage_set(i, 0, 0);
+
+		/* disable external hardware */
+		hardware_disable(i);
+	}
 	for (i = 0; i < PCMCIA_SOCKETS_NO; i++)
 		pcmcia_unregister_socket(&socket[i].socket);
 
-	m8xx_shutdown();
+	free_irq(pcmcia_schlvl, NULL);
 
-	platform_device_unregister(&m8xx_device);
-	driver_unregister(&m8xx_driver);
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int m8xx_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	return pcmcia_socket_dev_suspend(&pdev->dev, state);
+}
+
+static int m8xx_resume(struct platform_device *pdev)
+{
+	return pcmcia_socket_dev_resume(&pdev->dev);
+}
+#else
+#define m8xx_suspend NULL
+#define m8xx_resume NULL
+#endif
+
+static struct of_device_id m8xx_pcmcia_match[] = {
+	{
+		.type = "pcmcia",
+		.compatible = "fsl,pq-pcmcia",
+	},
+	{},
+};
+
+MODULE_DEVICE_TABLE(of, m8xx_pcmcia_match);
+
+static struct of_platform_driver m8xx_pcmcia_driver = {
+	.name		= (char *) driver_name,
+	.match_table	= m8xx_pcmcia_match,
+	.probe		= m8xx_probe,
+	.remove		= m8xx_remove,
+	.suspend	= m8xx_suspend,
+	.resume		= m8xx_resume,
+};
+
+static int __init m8xx_init(void)
+{
+	return of_register_platform_driver(&m8xx_pcmcia_driver);
+}
+
+static void __exit m8xx_exit(void)
+{
+	of_unregister_platform_driver(&m8xx_pcmcia_driver);
 }
 
 module_init(m8xx_init);
diff --git a/include/asm-powerpc/mpc8xx.h b/include/asm-powerpc/mpc8xx.h
index 580371120e1a..2be014b6f57c 100644
--- a/include/asm-powerpc/mpc8xx.h
+++ b/include/asm-powerpc/mpc8xx.h
@@ -23,6 +23,10 @@
 #include <platforms/8xx/mpc885ads.h>
 #endif
 
+#ifdef CONFIG_PCMCIA_M8XX
+extern struct mpc8xx_pcmcia_ops m8xx_pcmcia_ops;
+#endif
+
 #endif /* CONFIG_8xx */
 #endif /* __CONFIG_8xx_DEFS */
 #endif /* __KERNEL__ */
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 73710d617775..12e631f0fb77 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -120,5 +120,10 @@ struct fsl_spi_platform_data {
 	u32	sysclk;
 };
 
+struct mpc8xx_pcmcia_ops {
+	void(*hw_ctrl)(int slot, int enable);
+	int(*voltage_set)(int slot, int vcc, int vpp);
+};
+
 #endif /* _FSL_DEVICE_H_ */
 #endif /* __KERNEL__ */
-- 
cgit v1.3-8-gc7d7


From 4c62b53454a83178676e5ecae6665447d363c7b4 Mon Sep 17 00:00:00 2001
From: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Date: Wed, 27 Jun 2007 16:02:14 +0530
Subject: configfs: misc cleanups

1. item.c:config_item_cleanup() is a private function (only called by
config_item_release() in same file). However, it is spuriously
exported in include/linux/configfs.h, so remove that export and make
it static in item.c. Also, it is no longer exported / interface
function, so no need to give comment for this function (the comment
was stating obvious thing, anyway).

2. Kernel-doc comment format does not allow empty line between end of
comment and start of function (declaration line). There were several
such spurious empty lines in item.c, so fix them.

  fs/configfs/item.c       |   15 +++------------
  include/linux/configfs.h |    1 -
  2 files changed, 3 insertions(+), 13 deletions(-)

Signed-off-by: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/item.c       | 15 +++------------
 include/linux/configfs.h |  1 -
 2 files changed, 3 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 24421209f854..b762bbeaa0be 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
  *	dynamically allocated string that @item->ci_name points to.
  *	Otherwise, use the static @item->ci_namebuf array.
  */
-
 int config_item_set_name(struct config_item * item, const char * fmt, ...)
 {
 	int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
 	return item;
 }
 
-/**
- *	config_item_cleanup - free config_item resources.
- *	@item:	item.
- */
-
-void config_item_cleanup(struct config_item * item)
+static void config_item_cleanup(struct config_item * item)
 {
 	struct config_item_type * t = item->ci_type;
 	struct config_group * s = item->ci_group;
@@ -179,12 +173,10 @@ void config_item_put(struct config_item * item)
 		kref_put(&item->ci_kref, config_item_release);
 }
 
-
 /**
  *	config_group_init - initialize a group for use
  *	@k:	group
  */
-
 void config_group_init(struct config_group *group)
 {
 	config_item_init(&group->cg_item);
@@ -201,8 +193,8 @@ void config_group_init(struct config_group *group)
  *	looking for a matching config_item. If matching item is found
  *	take a reference and return the item.
  */
-
-struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+struct config_item *config_group_find_obj(struct config_group *group,
+					  const char * name)
 {
 	struct list_head * entry;
 	struct config_item * ret = NULL;
@@ -219,7 +211,6 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
 	return ret;
 }
 
-
 EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index fef6f3d0a4a7..3d4a96eb0e9b 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -75,7 +75,6 @@ extern void config_item_init(struct config_item *);
 extern void config_item_init_type_name(struct config_item *item,
 				       const char *name,
 				       struct config_item_type *type);
-extern void config_item_cleanup(struct config_item *);
 
 extern struct config_item * config_item_get(struct config_item *);
 extern void config_item_put(struct config_item *);
-- 
cgit v1.3-8-gc7d7


From 9b1d9aa4e9c5cafe73b9df21d758b50b5d75264d Mon Sep 17 00:00:00 2001
From: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Date: Wed, 4 Jul 2007 16:37:06 +0530
Subject: [PATCH] configfs+dlm: Separate out __CONFIGFS_ATTR into configfs.h

fs/dlm/config.c contains a useful generic macro called __CONFIGFS_ATTR
that is similar to sysfs' __ATTR macro that makes defining attributes
easy for any user of configfs. Separate it out into configfs.h so that
other users (forthcoming in dynamic netconsole patchset) can use it too.

Signed-off-by: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Cc: David Teigland <teigland@redhat.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/dlm/config.c          |  8 --------
 include/linux/configfs.h | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 5069b2cb5a1f..e47eb42406fa 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
 	return len;
 }
 
-#define __CONFIGFS_ATTR(_name,_mode,_read,_write) {                           \
-	.attr   = { .ca_name = __stringify(_name),                            \
-		    .ca_mode = _mode,                                         \
-		    .ca_owner = THIS_MODULE },                                \
-	.show   = _read,                                                      \
-	.store  = _write,                                                     \
-}
-
 #define CLUSTER_ATTR(name, check_zero)                                        \
 static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len)  \
 {                                                                             \
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 3d4a96eb0e9b..def7c83d43a2 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -130,6 +130,22 @@ struct configfs_attribute {
 	mode_t			ca_mode;
 };
 
+/*
+ * Users often need to create attribute structures for their configurable
+ * attributes, containing a configfs_attribute member and function pointers
+ * for the show() and store() operations on that attribute. They can use
+ * this macro (similar to sysfs' __ATTR) to make defining attributes easier.
+ */
+#define __CONFIGFS_ATTR(_name, _mode, _show, _store)			\
+{									\
+	.attr	= {							\
+			.ca_name = __stringify(_name),			\
+			.ca_mode = _mode,				\
+			.ca_owner = THIS_MODULE,			\
+	},								\
+	.show	= _show,						\
+	.store	= _store,						\
+}
 
 /*
  * If allow_link() exists, the item can symlink(2) out to other
-- 
cgit v1.3-8-gc7d7


From 3fe6c5ce1176cf661dbe71fc43b627c1a742a89a Mon Sep 17 00:00:00 2001
From: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Date: Wed, 4 Jul 2007 16:37:16 +0530
Subject: [PATCH] configfs+dlm: Rename config_group_find_obj and state
 semantics clearly

Configfs being based upon sysfs code, config_group_find_obj() is probably
so named because of the similar kset_find_obj() in sysfs. However,
"kobject"s in sysfs become "config_item"s in configfs, so let's call it
config_group_find_item() instead, for sake of uniformity, and make
corresponding change in the users of this function.

BTW a crucial difference between kset_find_obj and config_group_find_item
is in locking expectations. kset_find_obj does its locking by itself, but
config_group_find_item expects the *caller* to do the locking. The reason
for this: kset's have their own locks, config_group's don't but instead
rely on the subsystem mutex. And, subsystem needn't necessarily be around
when config_group_find_item() is called.

So let's state these locking semantics explicitly, and rectify the comment,
otherwise bugs could continue to occur in future, as they did in the past
(refer commit d82b8191e238 in gfs2-2.6-fixes.git).

[ I also took the opportunity to fix some bad whitespace and
double-empty lines. --Joel ]

[ Conflict in fs/dlm/config.c with commit
  3168b0780d06ace875696f8a648d04d6089654e5 manually resolved. --Mark ]

Signed-off-by: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Cc: David Teigland <teigland@redhat.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/item.c       | 18 ++++++++----------
 fs/dlm/config.c          |  2 +-
 include/linux/configfs.h |  7 ++-----
 3 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index b762bbeaa0be..76dc4c3e5d51 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -183,27 +183,25 @@ void config_group_init(struct config_group *group)
 	INIT_LIST_HEAD(&group->cg_children);
 }
 
-
 /**
- *	config_group_find_obj - search for item in group.
+ *	config_group_find_item - search for item in group.
  *	@group:	group we're looking in.
  *	@name:	item's name.
  *
- *	Lock group via @group->cg_subsys, and iterate over @group->cg_list,
- *	looking for a matching config_item. If matching item is found
- *	take a reference and return the item.
+ *	Iterate over @group->cg_list, looking for a matching config_item.
+ *	If matching item is found take a reference and return the item.
+ *	Caller must have locked group via @group->cg_subsys->su_mtx.
  */
-struct config_item *config_group_find_obj(struct config_group *group,
-					  const char * name)
+struct config_item *config_group_find_item(struct config_group *group,
+					   const char *name)
 {
 	struct list_head * entry;
 	struct config_item * ret = NULL;
 
-        /* XXX LOCKING! */
 	list_for_each(entry,&group->cg_children) {
 		struct config_item * item = to_item(entry);
 		if (config_item_name(item) &&
-                    !strcmp(config_item_name(item), name)) {
+		    !strcmp(config_item_name(item), name)) {
 			ret = config_item_get(item);
 			break;
 		}
@@ -215,4 +213,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-EXPORT_SYMBOL(config_group_find_obj);
+EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index e47eb42406fa..4348cb42cf17 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -752,7 +752,7 @@ static struct space *get_space(char *name)
 		return NULL;
 
 	down(&space_list->cg_subsys->su_sem);
-	i = config_group_find_obj(space_list, name);
+	i = config_group_find_item(space_list, name);
 	up(&space_list->cg_subsys->su_sem);
 
 	return to_space(i);
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index def7c83d43a2..bbb1b6cafa8b 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -86,12 +86,10 @@ struct config_item_type {
 	struct configfs_attribute		**ct_attrs;
 };
 
-
 /**
  *	group - a group of config_items of a specific type, belonging
  *	to a specific subsystem.
  */
-
 struct config_group {
 	struct config_item		cg_item;
 	struct list_head		cg_children;
@@ -99,13 +97,11 @@ struct config_group {
 	struct config_group		**default_groups;
 };
 
-
 extern void config_group_init(struct config_group *group);
 extern void config_group_init_type_name(struct config_group *group,
 					const char *name,
 					struct config_item_type *type);
 
-
 static inline struct config_group *to_config_group(struct config_item *item)
 {
 	return item ? container_of(item,struct config_group,cg_item) : NULL;
@@ -121,7 +117,8 @@ static inline void config_group_put(struct config_group *group)
 	config_item_put(&group->cg_item);
 }
 
-extern struct config_item *config_group_find_obj(struct config_group *, const char *);
+extern struct config_item *config_group_find_item(struct config_group *,
+						  const char *);
 
 
 struct configfs_attribute {
-- 
cgit v1.3-8-gc7d7


From e6bd07aee739566803425acdbf5cdb29919164e1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 6 Jul 2007 23:33:17 -0700
Subject: configfs: Convert subsystem semaphore to mutex

Convert the su_sem member of struct configfs_subsystem to a struct
mutex, as that's what it is. Also convert all the users and update
Documentation/configfs.txt and Documentation/configfs_example.c
accordingly.

[ Conflict in fs/dlm/config.c with commit
  3168b0780d06ace875696f8a648d04d6089654e5 manually resolved. --Mark ]

Inspired-by: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 Documentation/filesystems/configfs/configfs.txt       | 18 +++++++++---------
 Documentation/filesystems/configfs/configfs_example.c |  2 +-
 fs/configfs/dir.c                                     | 16 ++++++++--------
 fs/dlm/config.c                                       | 10 +++++-----
 fs/ocfs2/cluster/nodemanager.c                        |  2 +-
 include/linux/configfs.h                              |  4 ++--
 6 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index b34cdb50eab4..21f038e66724 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -280,18 +280,18 @@ tells configfs to make the subsystem appear in the file tree.
 
 	struct configfs_subsystem {
 		struct config_group	su_group;
-		struct semaphore	su_sem;
+		struct mutex		su_mutex;
 	};
 
 	int configfs_register_subsystem(struct configfs_subsystem *subsys);
 	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
 
-	A subsystem consists of a toplevel config_group and a semaphore.
+	A subsystem consists of a toplevel config_group and a mutex.
 The group is where child config_items are created.  For a subsystem,
 this group is usually defined statically.  Before calling
 configfs_register_subsystem(), the subsystem must have initialized the
 group via the usual group _init() functions, and it must also have
-initialized the semaphore.
+initialized the mutex.
 	When the register call returns, the subsystem is live, and it
 will be visible via configfs.  At that point, mkdir(2) can be called and
 the subsystem must be ready for it.
@@ -303,7 +303,7 @@ subsystem/group and the simple_child item in configfs_example.c  It
 shows a trivial object displaying and storing an attribute, and a simple
 group creating and destroying these children.
 
-[Hierarchy Navigation and the Subsystem Semaphore]
+[Hierarchy Navigation and the Subsystem Mutex]
 
 There is an extra bonus that configfs provides.  The config_groups and
 config_items are arranged in a hierarchy due to the fact that they
@@ -314,19 +314,19 @@ and config_item->ci_parent structure members.
 
 A subsystem can navigate the cg_children list and the ci_parent pointer
 to see the tree created by the subsystem.  This can race with configfs'
-management of the hierarchy, so configfs uses the subsystem semaphore to
+management of the hierarchy, so configfs uses the subsystem mutex to
 protect modifications.  Whenever a subsystem wants to navigate the
 hierarchy, it must do so under the protection of the subsystem
-semaphore.
+mutex.
 
-A subsystem will be prevented from acquiring the semaphore while a newly
+A subsystem will be prevented from acquiring the mutex while a newly
 allocated item has not been linked into this hierarchy.   Similarly, it
-will not be able to acquire the semaphore while a dropping item has not
+will not be able to acquire the mutex while a dropping item has not
 yet been unlinked.  This means that an item's ci_parent pointer will
 never be NULL while the item is in configfs, and that an item will only
 be in its parent's cg_children list for the same duration.  This allows
 a subsystem to trust ci_parent and cg_children while they hold the
-semaphore.
+mutex.
 
 [Item Aggregation Via symlink(2)]
 
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
index 2d6a14a463e0..e56d49264b39 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -453,7 +453,7 @@ static int __init configfs_example_init(void)
 		subsys = example_subsys[i];
 
 		config_group_init(&subsys->su_group);
-		init_MUTEX(&subsys->su_sem);
+		mutex_init(&subsys->su_mutex);
 		ret = configfs_register_subsystem(subsys);
 		if (ret) {
 			printk(KERN_ERR "Error %d while registering subsystem %s\n",
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e58f36..d3b1dbb9b5b8 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -562,7 +562,7 @@ static int populate_groups(struct config_group *group)
 
 /*
  * All of link_obj/unlink_obj/link_group/unlink_group require that
- * subsys->su_sem is held.
+ * subsys->su_mutex is held.
  */
 
 static void unlink_obj(struct config_item *item)
@@ -783,7 +783,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
 
-	down(&subsys->su_sem);
+	mutex_lock(&subsys->su_mutex);
 	group = NULL;
 	item = NULL;
 	if (type->ct_group_ops->make_group) {
@@ -797,7 +797,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		if (item)
 			link_obj(parent_item, item);
 	}
-	up(&subsys->su_sem);
+	mutex_unlock(&subsys->su_mutex);
 
 	kfree(name);
 	if (!item) {
@@ -841,13 +841,13 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_unlink:
 	if (ret) {
 		/* Tear down everything we built up */
-		down(&subsys->su_sem);
+		mutex_lock(&subsys->su_mutex);
 		if (group)
 			unlink_group(group);
 		else
 			unlink_obj(item);
 		client_drop_item(parent_item, item);
-		up(&subsys->su_sem);
+		mutex_unlock(&subsys->su_mutex);
 
 		if (module_got)
 			module_put(owner);
@@ -910,17 +910,17 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sd->s_type & CONFIGFS_USET_DIR) {
 		configfs_detach_group(item);
 
-		down(&subsys->su_sem);
+		mutex_lock(&subsys->su_mutex);
 		unlink_group(to_config_group(item));
 	} else {
 		configfs_detach_item(item);
 
-		down(&subsys->su_sem);
+		mutex_lock(&subsys->su_mutex);
 		unlink_obj(item);
 	}
 
 	client_drop_item(parent_item, item);
-	up(&subsys->su_sem);
+	mutex_unlock(&subsys->su_mutex);
 
 	/* Drop our reference from above */
 	config_item_put(item);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 4348cb42cf17..2f8e3c81bc19 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -607,7 +607,7 @@ static struct clusters clusters_root = {
 int dlm_config_init(void)
 {
 	config_group_init(&clusters_root.subsys.su_group);
-	init_MUTEX(&clusters_root.subsys.su_sem);
+	mutex_init(&clusters_root.subsys.su_mutex);
 	return configfs_register_subsystem(&clusters_root.subsys);
 }
 
@@ -751,9 +751,9 @@ static struct space *get_space(char *name)
 	if (!space_list)
 		return NULL;
 
-	down(&space_list->cg_subsys->su_sem);
+	mutex_lock(&space_list->cg_subsys->su_mutex);
 	i = config_group_find_item(space_list, name);
-	up(&space_list->cg_subsys->su_sem);
+	mutex_unlock(&space_list->cg_subsys->su_mutex);
 
 	return to_space(i);
 }
@@ -772,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 	if (!comm_list)
 		return NULL;
 
-	down(&clusters_root.subsys.su_sem);
+	mutex_lock(&clusters_root.subsys.su_mutex);
 
 	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
 		cm = to_comm(i);
@@ -792,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 			break;
 		}
 	}
-	up(&clusters_root.subsys.su_sem);
+	mutex_unlock(&clusters_root.subsys.su_mutex);
 
 	if (!found)
 		cm = NULL;
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f01ce0..48b77d113cb2 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -934,7 +934,7 @@ static int __init init_o2nm(void)
 		goto out_sysctl;
 
 	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
-	init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+	mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
 	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
 	if (ret) {
 		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index bbb1b6cafa8b..5ce0fc4e3b5b 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -40,9 +40,9 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/kref.h>
+#include <linux/mutex.h>
 
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
 
 #define CONFIGFS_ITEM_NAME_LEN	20
 
@@ -174,7 +174,7 @@ struct configfs_group_operations {
 
 struct configfs_subsystem {
 	struct config_group	su_group;
-	struct semaphore	su_sem;
+	struct mutex		su_mutex;
 };
 
 static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group)
-- 
cgit v1.3-8-gc7d7


From 299894cc9001b09e3e9685f2709b49e7e1092ccc Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 6 Oct 2006 17:33:23 -0700
Subject: configfs: accessing item hierarchy during rmdir(2)

Add a notification callback, ops->disconnect_notify(). It has the same
prototype as ->drop_item(), but it will be called just before the item
linkage is broken. This way, configfs users who want to do work while
the object is still in the heirarchy have a chance.

Client drivers will still need to config_item_put() in their
->drop_item(), if they implement it.  They need do nothing in
->disconnect_notify().  They don't have to provide it if they don't
care.  But someone who wants to be notified before ci_parent is set to
NULL can now be notified.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 Documentation/filesystems/configfs/configfs.txt | 12 ++++++++++
 fs/configfs/dir.c                               | 29 ++++++++++++++++++++++++-
 include/linux/configfs.h                        |  1 +
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index 21f038e66724..aef74cdecc21 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -238,6 +238,8 @@ config_item_type.
 		struct config_group *(*make_group)(struct config_group *group,
 						   const char *name);
 		int (*commit_item)(struct config_item *item);
+		void (*disconnect_notify)(struct config_group *group,
+					  struct config_item *item);
 		void (*drop_item)(struct config_group *group,
 				  struct config_item *item);
 	};
@@ -268,6 +270,16 @@ the item in other threads, the memory is safe.  It may take some time
 for the item to actually disappear from the subsystem's usage.  But it
 is gone from configfs.
 
+When drop_item() is called, the item's linkage has already been torn
+down.  It no longer has a reference on its parent and has no place in
+the item hierarchy.  If a client needs to do some cleanup before this
+teardown happens, the subsystem can implement the
+ct_group_ops->disconnect_notify() method.  The method is called after
+configfs has removed the item from the filesystem view but before the
+item is removed from its parent group.  Like drop_item(),
+disconnect_notify() is void and cannot fail.  Client subsystems should
+not drop any references here, as they still must do it in drop_item().
+
 A config_group cannot be removed while it still has child items.  This
 is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
 called, as the item has not been dropped.  rmdir(2) will fail, as the
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d3b1dbb9b5b8..125954723eb7 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -713,6 +713,28 @@ static void configfs_detach_group(struct config_item *item)
 	configfs_detach_item(item);
 }
 
+/*
+ * After the item has been detached from the filesystem view, we are
+ * ready to tear it out of the hierarchy.  Notify the client before
+ * we do that so they can perform any cleanup that requires
+ * navigating the hierarchy.  A client does not need to provide this
+ * callback.  The subsystem semaphore MUST be held by the caller, and
+ * references must be valid for both items.  It also assumes the
+ * caller has validated ci_type.
+ */
+static void client_disconnect_notify(struct config_item *parent_item,
+				     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
+		type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
+						      item);
+}
+
 /*
  * Drop the initial reference from make_item()/make_group()
  * This function assumes that reference is held on item
@@ -733,7 +755,7 @@ static void client_drop_item(struct config_item *parent_item,
 	 */
 	if (type->ct_group_ops && type->ct_group_ops->drop_item)
 		type->ct_group_ops->drop_item(to_config_group(parent_item),
-						item);
+					      item);
 	else
 		config_item_put(item);
 }
@@ -842,11 +864,14 @@ out_unlink:
 	if (ret) {
 		/* Tear down everything we built up */
 		mutex_lock(&subsys->su_mutex);
+
+		client_disconnect_notify(parent_item, item);
 		if (group)
 			unlink_group(group);
 		else
 			unlink_obj(item);
 		client_drop_item(parent_item, item);
+
 		mutex_unlock(&subsys->su_mutex);
 
 		if (module_got)
@@ -911,11 +936,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 		configfs_detach_group(item);
 
 		mutex_lock(&subsys->su_mutex);
+		client_disconnect_notify(parent_item, item);
 		unlink_group(to_config_group(item));
 	} else {
 		configfs_detach_item(item);
 
 		mutex_lock(&subsys->su_mutex);
+		client_disconnect_notify(parent_item, item);
 		unlink_obj(item);
 	}
 
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 5ce0fc4e3b5b..8227e730dac7 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -169,6 +169,7 @@ struct configfs_group_operations {
 	struct config_item *(*make_item)(struct config_group *group, const char *name);
 	struct config_group *(*make_group)(struct config_group *group, const char *name);
 	int (*commit_item)(struct config_item *item);
+	void (*disconnect_notify)(struct config_group *group, struct config_item *item);
 	void (*drop_item)(struct config_group *group, struct config_item *item);
 };
 
-- 
cgit v1.3-8-gc7d7


From 631d1febab8e546e3bb800bdfe2c212b8adf87de Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 18 Jun 2007 18:06:09 -0700
Subject: configfs: config item dependancies.

Sometimes other drivers depend on particular configfs items.  For
example, ocfs2 mounts depend on a heartbeat region item.  If that
region item is removed with rmdir(2), the ocfs2 mount must BUG or go
readonly.  Not happy.

This provides two additional API calls: configfs_depend_item() and
configfs_undepend_item().  A client driver can call
configfs_depend_item() on an existing item to tell configfs that it is
depended on.  configfs will then return -EBUSY from rmdir(2) for that
item.  When the item is no longer depended on, the client driver calls
configfs_undepend_item() on it.

These API cannot be called underneath any configfs callbacks, as
they will conflict.  They can block and allocate.  A client driver
probably shouldn't calling them of its own gumption.  Rather it should
be providing an API that external subsystems call.

How does this work?  Imagine the ocfs2 mount process.  When it mounts,
it asks for a heart region item.  This is done via a call into the
heartbeat code.  Inside the heartbeat code, the region item is looked
up.  Here, the heartbeat code calls configfs_depend_item().  If it
succeeds, then heartbeat knows the region is safe to give to ocfs2.
If it fails, it was being torn down anyway, and heartbeat can gracefully
pass up an error.

[ Fixed some bad whitespace in configfs.txt. --Mark ]

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 Documentation/filesystems/configfs/configfs.txt |  27 +++
 fs/configfs/configfs_internal.h                 |   7 +-
 fs/configfs/dir.c                               | 244 ++++++++++++++++++++++++
 include/linux/configfs.h                        |   5 +
 4 files changed, 280 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index aef74cdecc21..d1b98257d000 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -398,6 +398,33 @@ As a consequence of this, default_groups cannot be removed directly via
 rmdir(2).  They also are not considered when rmdir(2) on the parent
 group is checking for children.
 
+[Dependant Subsystems]
+
+Sometimes other drivers depend on particular configfs items.  For
+example, ocfs2 mounts depend on a heartbeat region item.  If that
+region item is removed with rmdir(2), the ocfs2 mount must BUG or go
+readonly.  Not happy.
+
+configfs provides two additional API calls: configfs_depend_item() and
+configfs_undepend_item().  A client driver can call
+configfs_depend_item() on an existing item to tell configfs that it is
+depended on.  configfs will then return -EBUSY from rmdir(2) for that
+item.  When the item is no longer depended on, the client driver calls
+configfs_undepend_item() on it.
+
+These API cannot be called underneath any configfs callbacks, as
+they will conflict.  They can block and allocate.  A client driver
+probably shouldn't calling them of its own gumption.  Rather it should
+be providing an API that external subsystems call.
+
+How does this work?  Imagine the ocfs2 mount process.  When it mounts,
+it asks for a heartbeat region item.  This is done via a call into the
+heartbeat code.  Inside the heartbeat code, the region item is looked
+up.  Here, the heartbeat code calls configfs_depend_item().  If it
+succeeds, then heartbeat knows the region is safe to give to ocfs2.
+If it fails, it was being torn down anyway, and heartbeat can gracefully
+pass up an error.
+
 [Committable Items]
 
 NOTE: Committable items are currently unimplemented.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c034b312..3b0185fdf9a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
 
 struct configfs_dirent {
 	atomic_t		s_count;
+	int			s_dependent_count;
 	struct list_head	s_sibling;
 	struct list_head	s_children;
 	struct list_head	s_links;
-	void 			* s_element;
+	void			* s_element;
 	int			s_type;
 	umode_t			s_mode;
 	struct dentry		* s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
 
 #define CONFIGFS_ROOT		0x0001
 #define CONFIGFS_DIR		0x0002
-#define CONFIGFS_ITEM_ATTR 	0x0004
-#define CONFIGFS_ITEM_LINK 	0x0020
+#define CONFIGFS_ITEM_ATTR	0x0004
+#define CONFIGFS_ITEM_LINK	0x0020
 #define CONFIGFS_USET_DIR	0x0040
 #define CONFIGFS_USET_DEFAULT	0x0080
 #define CONFIGFS_USET_DROPPING	0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 125954723eb7..2f436d4f1d6d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
 			/* Mark that we've taken i_mutex */
 			sd->s_type |= CONFIGFS_USET_DROPPING;
 
+			/*
+			 * Yup, recursive.  If there's a problem, blame
+			 * deep nesting of default_groups
+			 */
 			ret = configfs_detach_prep(sd->s_dentry);
 			if (!ret)
 				continue;
@@ -760,6 +764,239 @@ static void client_drop_item(struct config_item *parent_item,
 		config_item_put(item);
 }
 
+#ifdef DEBUG
+static void configfs_dump_one(struct configfs_dirent *sd, int level)
+{
+	printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
+
+#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
+	type_print(CONFIGFS_ROOT);
+	type_print(CONFIGFS_DIR);
+	type_print(CONFIGFS_ITEM_ATTR);
+	type_print(CONFIGFS_ITEM_LINK);
+	type_print(CONFIGFS_USET_DIR);
+	type_print(CONFIGFS_USET_DEFAULT);
+	type_print(CONFIGFS_USET_DROPPING);
+#undef type_print
+}
+
+static int configfs_dump(struct configfs_dirent *sd, int level)
+{
+	struct configfs_dirent *child_sd;
+	int ret = 0;
+
+	configfs_dump_one(sd, level);
+
+	if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
+		return 0;
+
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+		ret = configfs_dump(child_sd, level + 2);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+#endif
+
+
+/*
+ * configfs_depend_item() and configfs_undepend_item()
+ *
+ * WARNING: Do not call these from a configfs callback!
+ *
+ * This describes these functions and their helpers.
+ *
+ * Allow another kernel system to depend on a config_item.  If this
+ * happens, the item cannot go away until the dependant can live without
+ * it.  The idea is to give client modules as simple an interface as
+ * possible.  When a system asks them to depend on an item, they just
+ * call configfs_depend_item().  If the item is live and the client
+ * driver is in good shape, we'll happily do the work for them.
+ *
+ * Why is the locking complex?  Because configfs uses the VFS to handle
+ * all locking, but this function is called outside the normal
+ * VFS->configfs path.  So it must take VFS locks to prevent the
+ * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc).  This is
+ * why you can't call these functions underneath configfs callbacks.
+ *
+ * Note, btw, that this can be called at *any* time, even when a configfs
+ * subsystem isn't registered, or when configfs is loading or unloading.
+ * Just like configfs_register_subsystem().  So we take the same
+ * precautions.  We pin the filesystem.  We lock each i_mutex _in_order_
+ * on our way down the tree.  If we can find the target item in the
+ * configfs tree, it must be part of the subsystem tree as well, so we
+ * do not need the subsystem semaphore.  Holding the i_mutex chain locks
+ * out mkdir() and rmdir(), who might be racing us.
+ */
+
+/*
+ * configfs_depend_prep()
+ *
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes.  This is similar but not the same to configfs_detach_prep().
+ * Note that configfs_detach_prep() expects the parent to be locked when it
+ * is called, but we lock the parent *inside* configfs_depend_prep().  We
+ * do that so we can unlock it if we find nothing.
+ *
+ * Here we do a depth-first search of the dentry hierarchy looking for
+ * our object.  We take i_mutex on each step of the way down.  IT IS
+ * ESSENTIAL THAT i_mutex LOCKING IS ORDERED.  If we come back up a branch,
+ * we'll drop the i_mutex.
+ *
+ * If the target is not found, -ENOENT is bubbled up and we have released
+ * all locks.  If the target was found, the locks will be cleared by
+ * configfs_depend_rollback().
+ *
+ * This adds a requirement that all config_items be unique!
+ *
+ * This is recursive because the locking traversal is tricky.  There isn't
+ * much on the stack, though, so folks that need this function - be careful
+ * about your stack!  Patches will be accepted to make it iterative.
+ */
+static int configfs_depend_prep(struct dentry *origin,
+				struct config_item *target)
+{
+	struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+	int ret = 0;
+
+	BUG_ON(!origin || !sd);
+
+	/* Lock this guy on the way down */
+	mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+	if (sd->s_element == target)  /* Boo-yah */
+		goto out;
+
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+		if (child_sd->s_type & CONFIGFS_DIR) {
+			ret = configfs_depend_prep(child_sd->s_dentry,
+						   target);
+			if (!ret)
+				goto out;  /* Child path boo-yah */
+		}
+	}
+
+	/* We looped all our children and didn't find target */
+	mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
+	ret = -ENOENT;
+
+out:
+	return ret;
+}
+
+/*
+ * This is ONLY called if configfs_depend_prep() did its job.  So we can
+ * trust the entire path from item back up to origin.
+ *
+ * We walk backwards from item, unlocking each i_mutex.  We finish by
+ * unlocking origin.
+ */
+static void configfs_depend_rollback(struct dentry *origin,
+				     struct config_item *item)
+{
+	struct dentry *dentry = item->ci_dentry;
+
+	while (dentry != origin) {
+		mutex_unlock(&dentry->d_inode->i_mutex);
+		dentry = dentry->d_parent;
+	}
+
+	mutex_unlock(&origin->d_inode->i_mutex);
+}
+
+int configfs_depend_item(struct configfs_subsystem *subsys,
+			 struct config_item *target)
+{
+	int ret;
+	struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+	struct config_item *s_item = &subsys->su_group.cg_item;
+
+	/*
+	 * Pin the configfs filesystem.  This means we can safely access
+	 * the root of the configfs filesystem.
+	 */
+	ret = configfs_pin_fs();
+	if (ret)
+		return ret;
+
+	/*
+	 * Next, lock the root directory.  We're going to check that the
+	 * subsystem is really registered, and so we need to lock out
+	 * configfs_[un]register_subsystem().
+	 */
+	mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+
+	root_sd = configfs_sb->s_root->d_fsdata;
+
+	list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+		if (p->s_type & CONFIGFS_DIR) {
+			if (p->s_element == s_item) {
+				subsys_sd = p;
+				break;
+			}
+		}
+	}
+
+	if (!subsys_sd) {
+		ret = -ENOENT;
+		goto out_unlock_fs;
+	}
+
+	/* Ok, now we can trust subsys/s_item */
+
+	/* Scan the tree, locking i_mutex recursively, return 0 if found */
+	ret = configfs_depend_prep(subsys_sd->s_dentry, target);
+	if (ret)
+		goto out_unlock_fs;
+
+	/* We hold all i_mutexes from the subsystem down to the target */
+	p = target->ci_dentry->d_fsdata;
+	p->s_dependent_count += 1;
+
+	configfs_depend_rollback(subsys_sd->s_dentry, target);
+
+out_unlock_fs:
+	mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+
+	/*
+	 * If we succeeded, the fs is pinned via other methods.  If not,
+	 * we're done with it anyway.  So release_fs() is always right.
+	 */
+	configfs_release_fs();
+
+	return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item);
+
+/*
+ * Release the dependent linkage.  This is much simpler than
+ * configfs_depend_item() because we know that that the client driver is
+ * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
+ */
+void configfs_undepend_item(struct configfs_subsystem *subsys,
+			    struct config_item *target)
+{
+	struct configfs_dirent *sd;
+
+	/*
+	 * Since we can trust everything is pinned, we just need i_mutex
+	 * on the item.
+	 */
+	mutex_lock(&target->ci_dentry->d_inode->i_mutex);
+
+	sd = target->ci_dentry->d_fsdata;
+	BUG_ON(sd->s_dependent_count < 1);
+
+	sd->s_dependent_count -= 1;
+
+	/*
+	 * After this unlock, we cannot trust the item to stay alive!
+	 * DO NOT REFERENCE item after this unlock.
+	 */
+	mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
+}
+EXPORT_SYMBOL(configfs_undepend_item);
 
 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
@@ -906,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sd->s_type & CONFIGFS_USET_DEFAULT)
 		return -EPERM;
 
+	/*
+	 * Here's where we check for dependents.  We're protected by
+	 * i_mutex.
+	 */
+	if (sd->s_dependent_count)
+		return -EBUSY;
+
 	/* Get a working ref until we have the child */
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	subsys = to_config_group(parent_item)->cg_subsys;
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 8227e730dac7..8c6967f3fb11 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -188,6 +188,11 @@ static inline struct configfs_subsystem *to_configfs_subsystem(struct config_gro
 int configfs_register_subsystem(struct configfs_subsystem *subsys);
 void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
 
+/* These functions can sleep and can alloc with GFP_KERNEL */
+/* WARNING: These cannot be called underneath configfs callbacks!! */
+int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target);
+void configfs_undepend_item(struct configfs_subsystem *subsys, struct config_item *target);
+
 #endif  /* __KERNEL__ */
 
 #endif /* _CONFIGFS_H_ */
-- 
cgit v1.3-8-gc7d7


From 45a66c1c3ff88e8050dd25e81bafdf79a12a8042 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 9 Jul 2007 11:46:13 -0700
Subject: libata-core: convert to use cancel_rearming_delayed_work()

We should not use cancel_work_sync(delayed_work->work). This works, but not
good. We can use cancel_rearming_delayed_work(), this also simplifies the
code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-core.c | 44 ++++----------------------------------------
 include/linux/libata.h    |  1 -
 2 files changed, 4 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 5b25311ba885..257fda90e527 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1283,18 +1283,11 @@ static unsigned int ata_id_xfermask(const u16 *id)
 void ata_port_queue_task(struct ata_port *ap, work_func_t fn, void *data,
 			 unsigned long delay)
 {
-	int rc;
-
-	if (ap->pflags & ATA_PFLAG_FLUSH_PORT_TASK)
-		return;
-
 	PREPARE_DELAYED_WORK(&ap->port_task, fn);
 	ap->port_task_data = data;
 
-	rc = queue_delayed_work(ata_wq, &ap->port_task, delay);
-
-	/* rc == 0 means that another user is using port task */
-	WARN_ON(rc == 0);
+	/* may fail if ata_port_flush_task() in progress */
+	queue_delayed_work(ata_wq, &ap->port_task, delay);
 }
 
 /**
@@ -1309,32 +1302,9 @@ void ata_port_queue_task(struct ata_port *ap, work_func_t fn, void *data,
  */
 void ata_port_flush_task(struct ata_port *ap)
 {
-	unsigned long flags;
-
 	DPRINTK("ENTER\n");
 
-	spin_lock_irqsave(ap->lock, flags);
-	ap->pflags |= ATA_PFLAG_FLUSH_PORT_TASK;
-	spin_unlock_irqrestore(ap->lock, flags);
-
-	DPRINTK("flush #1\n");
-	cancel_work_sync(&ap->port_task.work); /* akpm: seems unneeded */
-
-	/*
-	 * At this point, if a task is running, it's guaranteed to see
-	 * the FLUSH flag; thus, it will never queue pio tasks again.
-	 * Cancel and flush.
-	 */
-	if (!cancel_delayed_work(&ap->port_task)) {
-		if (ata_msg_ctl(ap))
-			ata_port_printk(ap, KERN_DEBUG, "%s: flush #2\n",
-					__FUNCTION__);
-		cancel_work_sync(&ap->port_task.work);
-	}
-
-	spin_lock_irqsave(ap->lock, flags);
-	ap->pflags &= ~ATA_PFLAG_FLUSH_PORT_TASK;
-	spin_unlock_irqrestore(ap->lock, flags);
+	cancel_rearming_delayed_work(&ap->port_task);
 
 	if (ata_msg_ctl(ap))
 		ata_port_printk(ap, KERN_DEBUG, "%s: EXIT\n", __FUNCTION__);
@@ -6557,13 +6527,7 @@ void ata_port_detach(struct ata_port *ap)
 	spin_unlock_irqrestore(ap->lock, flags);
 
 	ata_port_wait_eh(ap);
-
-	/* Flush hotplug task.  The sequence is similar to
-	 * ata_port_flush_task().
-	 */
-	cancel_work_sync(&ap->hotplug_task.work); /* akpm: why? */
-	cancel_delayed_work(&ap->hotplug_task);
-	cancel_work_sync(&ap->hotplug_task.work);
+	cancel_rearming_delayed_work(&ap->hotplug_task);
 
  skip_eh:
 	/* remove the associated SCSI host */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a3df64677ac3..bf98d44c8109 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -196,7 +196,6 @@ enum {
 	ATA_PFLAG_SCSI_HOTPLUG	= (1 << 6), /* SCSI hotplug scheduled */
 	ATA_PFLAG_INITIALIZING	= (1 << 7), /* being initialized, don't touch */
 
-	ATA_PFLAG_FLUSH_PORT_TASK = (1 << 16), /* flush port task */
 	ATA_PFLAG_SUSPENDED	= (1 << 17), /* port is suspended (power) */
 	ATA_PFLAG_PM_PENDING	= (1 << 18), /* PM operation pending */
 	ATA_PFLAG_GTM_VALID	= (1 << 19), /* acpi_gtm data valid */
-- 
cgit v1.3-8-gc7d7


From d583bc18812f8da52bf25eef9cd111e5fd46a6ab Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 4 Jul 2007 18:02:07 +0900
Subject: libata: simplify PCI legacy SFF host handling

With PCI resource fix up for legacy hosts.  We can use the same code
path to allocate IO resources and initialize host for both legacy and
native SFF hosts.  Only IRQ requesting needs to be different.

Rename ata_pci_*_native_host() to ata_pci_*_sff_host(), kill all
legacy specific functions and use the renamed functions instead.  This
simplifies code a lot.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-core.c |   4 +-
 drivers/ata/libata-sff.c  | 288 ++++++----------------------------------------
 drivers/ata/sata_nv.c     |   2 +-
 drivers/ata/sata_sis.c    |   2 +-
 drivers/ata/sata_uli.c    |   2 +-
 drivers/ata/sata_via.c    |   2 +-
 include/linux/libata.h    |   8 +-
 7 files changed, 42 insertions(+), 266 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 257fda90e527..681693534f4f 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6925,9 +6925,9 @@ EXPORT_SYMBOL_GPL(ata_timing_merge);
 
 #ifdef CONFIG_PCI
 EXPORT_SYMBOL_GPL(pci_test_config_bits);
-EXPORT_SYMBOL_GPL(ata_pci_init_native_host);
+EXPORT_SYMBOL_GPL(ata_pci_init_sff_host);
 EXPORT_SYMBOL_GPL(ata_pci_init_bmdma);
-EXPORT_SYMBOL_GPL(ata_pci_prepare_native_host);
+EXPORT_SYMBOL_GPL(ata_pci_prepare_sff_host);
 EXPORT_SYMBOL_GPL(ata_pci_init_one);
 EXPORT_SYMBOL_GPL(ata_pci_remove_one);
 #ifdef CONFIG_PM
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index fa1c22c7b38f..ca7d2245d684 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -604,13 +604,17 @@ int ata_pci_init_bmdma(struct ata_host *host)
 }
 
 /**
- *	ata_pci_init_native_host - acquire native ATA resources and init host
+ *	ata_pci_init_sff_host - acquire native PCI ATA resources and init host
  *	@host: target ATA host
  *
  *	Acquire native PCI ATA resources for @host and initialize the
  *	first two ports of @host accordingly.  Ports marked dummy are
  *	skipped and allocation failure makes the port dummy.
  *
+ *	Note that native PCI resources are valid even for legacy hosts
+ *	as we fix up pdev resources array early in boot, so this
+ *	function can be used for both native and legacy SFF hosts.
+ *
  *	LOCKING:
  *	Inherited from calling layer (may sleep).
  *
@@ -618,7 +622,7 @@ int ata_pci_init_bmdma(struct ata_host *host)
  *	0 if at least one port is initialized, -ENODEV if no port is
  *	available.
  */
-int ata_pci_init_native_host(struct ata_host *host)
+int ata_pci_init_sff_host(struct ata_host *host)
 {
 	struct device *gdev = host->dev;
 	struct pci_dev *pdev = to_pci_dev(gdev);
@@ -673,7 +677,7 @@ int ata_pci_init_native_host(struct ata_host *host)
 }
 
 /**
- *	ata_pci_prepare_native_host - helper to prepare native PCI ATA host
+ *	ata_pci_prepare_sff_host - helper to prepare native PCI ATA host
  *	@pdev: target PCI device
  *	@ppi: array of port_info, must be enough for two ports
  *	@r_host: out argument for the initialized ATA host
@@ -687,9 +691,9 @@ int ata_pci_init_native_host(struct ata_host *host)
  *	RETURNS:
  *	0 on success, -errno otherwise.
  */
-int ata_pci_prepare_native_host(struct pci_dev *pdev,
-				const struct ata_port_info * const * ppi,
-				struct ata_host **r_host)
+int ata_pci_prepare_sff_host(struct pci_dev *pdev,
+			     const struct ata_port_info * const * ppi,
+			     struct ata_host **r_host)
 {
 	struct ata_host *host;
 	int rc;
@@ -705,7 +709,7 @@ int ata_pci_prepare_native_host(struct pci_dev *pdev,
 		goto err_out;
 	}
 
-	rc = ata_pci_init_native_host(host);
+	rc = ata_pci_init_sff_host(host);
 	if (rc)
 		goto err_out;
 
@@ -730,221 +734,6 @@ int ata_pci_prepare_native_host(struct pci_dev *pdev,
 	return rc;
 }
 
-struct ata_legacy_devres {
-	unsigned int	mask;
-	unsigned long	cmd_port[2];
-	void __iomem *	cmd_addr[2];
-	void __iomem *	ctl_addr[2];
-	unsigned int	irq[2];
-	void *		irq_dev_id[2];
-};
-
-static void ata_legacy_free_irqs(struct ata_legacy_devres *legacy_dr)
-{
-	int i;
-
-	for (i = 0; i < 2; i++) {
-		if (!legacy_dr->irq[i])
-			continue;
-
-		free_irq(legacy_dr->irq[i], legacy_dr->irq_dev_id[i]);
-		legacy_dr->irq[i] = 0;
-		legacy_dr->irq_dev_id[i] = NULL;
-	}
-}
-
-static void ata_legacy_release(struct device *gdev, void *res)
-{
-	struct ata_legacy_devres *this = res;
-	int i;
-
-	ata_legacy_free_irqs(this);
-
-	for (i = 0; i < 2; i++) {
-		if (this->cmd_addr[i])
-			ioport_unmap(this->cmd_addr[i]);
-		if (this->ctl_addr[i])
-			ioport_unmap(this->ctl_addr[i]);
-		if (this->cmd_port[i])
-			release_region(this->cmd_port[i], 8);
-	}
-}
-
-static int ata_init_legacy_port(struct ata_port *ap,
-				struct ata_legacy_devres *legacy_dr)
-{
-	struct ata_host *host = ap->host;
-	int port_no = ap->port_no;
-	unsigned long cmd_port, ctl_port;
-
-	if (port_no == 0) {
-		cmd_port = ATA_PRIMARY_CMD;
-		ctl_port = ATA_PRIMARY_CTL;
-	} else {
-		cmd_port = ATA_SECONDARY_CMD;
-		ctl_port = ATA_SECONDARY_CTL;
-	}
-
-	/* request cmd_port */
-	if (request_region(cmd_port, 8, "libata"))
-		legacy_dr->cmd_port[port_no] = cmd_port;
-	else {
-		dev_printk(KERN_WARNING, host->dev,
-			   "0x%0lX IDE port busy\n", cmd_port);
-		return -EBUSY;
-	}
-
-	/* iomap cmd and ctl ports */
-	legacy_dr->cmd_addr[port_no] = ioport_map(cmd_port, 8);
-	legacy_dr->ctl_addr[port_no] = ioport_map(ctl_port, 1);
-	if (!legacy_dr->cmd_addr[port_no] || !legacy_dr->ctl_addr[port_no]) {
-		dev_printk(KERN_WARNING, host->dev,
-			   "failed to map cmd/ctl ports\n");
-		return -ENOMEM;
-	}
-
-	/* init IO addresses */
-	ap->ioaddr.cmd_addr = legacy_dr->cmd_addr[port_no];
-	ap->ioaddr.altstatus_addr = legacy_dr->ctl_addr[port_no];
-	ap->ioaddr.ctl_addr = legacy_dr->ctl_addr[port_no];
-	ata_std_ports(&ap->ioaddr);
-
-	return 0;
-}
-
-/**
- *	ata_init_legacy_host - acquire legacy ATA resources and init ATA host
- *	@host: target ATA host
- *	@was_busy: out parameter, indicates whether any port was busy
- *
- *	Acquire legacy ATA resources for the first two ports of @host
- *	and initialize it accordingly.  Ports marked dummy are skipped
- *	and resource acquistion failure makes the port dummy.
- *
- *	LOCKING:
- *	Inherited from calling layer (may sleep).
- *
- *	RETURNS:
- *	0 if at least one port is initialized, -ENODEV if no port is
- *	available.
- */
-static int ata_init_legacy_host(struct ata_host *host, int *was_busy)
-{
-	struct device *gdev = host->dev;
-	struct ata_legacy_devres *legacy_dr;
-	int i, rc;
-
-	if (!devres_open_group(gdev, NULL, GFP_KERNEL))
-		return -ENOMEM;
-
-	rc = -ENOMEM;
-	legacy_dr = devres_alloc(ata_legacy_release, sizeof(*legacy_dr),
-				 GFP_KERNEL);
-	if (!legacy_dr)
-		goto err_out;
-	devres_add(gdev, legacy_dr);
-
-	for (i = 0; i < 2; i++) {
-		if (ata_port_is_dummy(host->ports[i]))
-			continue;
-
-		rc = ata_init_legacy_port(host->ports[i], legacy_dr);
-		if (rc == 0)
-			legacy_dr->mask |= 1 << i;
-		else {
-			if (rc == -EBUSY)
-				(*was_busy)++;
-			host->ports[i]->ops = &ata_dummy_port_ops;
-		}
-	}
-
-	if (!legacy_dr->mask) {
-		dev_printk(KERN_ERR, gdev, "no available legacy port\n");
-		return -ENODEV;
-	}
-
-	devres_remove_group(gdev, NULL);
-	return 0;
-
- err_out:
-	devres_release_group(gdev, NULL);
-	return rc;
-}
-
-/**
- *	ata_request_legacy_irqs - request legacy ATA IRQs
- *	@host: target ATA host
- *	@handler: array of IRQ handlers
- *	@irq_flags: array of IRQ flags
- *	@dev_id: array of IRQ dev_ids
- *
- *	Request legacy IRQs for non-dummy legacy ports in @host.  All
- *	IRQ parameters are passed as array to allow ports to have
- *	separate IRQ handlers.
- *
- *	LOCKING:
- *	Inherited from calling layer (may sleep).
- *
- *	RETURNS:
- *	0 on success, -errno otherwise.
- */
-static int ata_request_legacy_irqs(struct ata_host *host,
-				   irq_handler_t const *handler,
-				   const unsigned int *irq_flags,
-				   void * const *dev_id)
-{
-	struct device *gdev = host->dev;
-	struct ata_legacy_devres *legacy_dr;
-	int i, rc;
-
-	legacy_dr = devres_find(host->dev, ata_legacy_release, NULL, NULL);
-	BUG_ON(!legacy_dr);
-
-	for (i = 0; i < 2; i++) {
-		unsigned int irq;
-
-		/* FIXME: ATA_*_IRQ() should take generic device not pci_dev */
-		if (i == 0)
-			irq = ATA_PRIMARY_IRQ(to_pci_dev(gdev));
-		else
-			irq = ATA_SECONDARY_IRQ(to_pci_dev(gdev));
-
-		if (!(legacy_dr->mask & (1 << i)))
-			continue;
-
-		if (!handler[i]) {
-			dev_printk(KERN_ERR, gdev,
-				   "NULL handler specified for port %d\n", i);
-			rc = -EINVAL;
-			goto err_out;
-		}
-
-		rc = request_irq(irq, handler[i], irq_flags[i], DRV_NAME,
-				 dev_id[i]);
-		if (rc) {
-			dev_printk(KERN_ERR, gdev,
-				"irq %u request failed (errno=%d)\n", irq, rc);
-			goto err_out;
-		}
-
-		/* record irq allocation in legacy_dr */
-		legacy_dr->irq[i] = irq;
-		legacy_dr->irq_dev_id[i] = dev_id[i];
-
-		/* only used to print info */
-		if (i == 0)
-			host->irq = irq;
-		else
-			host->irq2 = irq;
-	}
-
-	return 0;
-
- err_out:
-	ata_legacy_free_irqs(legacy_dr);
-	return rc;
-}
-
 /**
  *	ata_pci_init_one - Initialize/register PCI IDE host controller
  *	@pdev: Controller to be initialized
@@ -1029,35 +818,11 @@ int ata_pci_init_one(struct pci_dev *pdev,
 #endif
 	}
 
-	/* alloc and init host */
-	host = ata_host_alloc_pinfo(dev, ppi, 2);
-	if (!host) {
-		dev_printk(KERN_ERR, &pdev->dev,
-			   "failed to allocate ATA host\n");
-		rc = -ENOMEM;
+	/* prepare host */
+	rc = ata_pci_prepare_sff_host(pdev, ppi, &host);
+	if (rc)
 		goto err_out;
-	}
 
-	if (!legacy_mode) {
-		rc = ata_pci_init_native_host(host);
-		if (rc)
-			goto err_out;
-	} else {
-		int was_busy = 0;
-
-		rc = ata_init_legacy_host(host, &was_busy);
-		if (was_busy)
-			pcim_pin_device(pdev);
-		if (rc)
-			goto err_out;
-
-		/* request respective PCI regions, may fail */
-		rc = pci_request_region(pdev, 1, DRV_NAME);
-		rc = pci_request_region(pdev, 3, DRV_NAME);
-	}
-
-	/* init BMDMA, may fail */
-	ata_pci_init_bmdma(host);
 	pci_set_master(pdev);
 
 	/* start host and request IRQ */
@@ -1068,17 +833,28 @@ int ata_pci_init_one(struct pci_dev *pdev,
 	if (!legacy_mode) {
 		rc = devm_request_irq(dev, pdev->irq, pi->port_ops->irq_handler,
 				      IRQF_SHARED, DRV_NAME, host);
+		if (rc)
+			goto err_out;
 		host->irq = pdev->irq;
 	} else {
-		irq_handler_t handler[2] = { host->ops->irq_handler,
-					     host->ops->irq_handler };
-		unsigned int irq_flags[2] = { IRQF_SHARED, IRQF_SHARED };
-		void *dev_id[2] = { host, host };
+		if (!ata_port_is_dummy(host->ports[0])) {
+			host->irq = ATA_PRIMARY_IRQ(pdev);
+			rc = devm_request_irq(dev, host->irq,
+					      pi->port_ops->irq_handler,
+					      IRQF_SHARED, DRV_NAME, host);
+			if (rc)
+				goto err_out;
+		}
 
-		rc = ata_request_legacy_irqs(host, handler, irq_flags, dev_id);
+		if (!ata_port_is_dummy(host->ports[1])) {
+			host->irq2 = ATA_SECONDARY_IRQ(pdev);
+			rc = devm_request_irq(dev, host->irq2,
+					      pi->port_ops->irq_handler,
+					      IRQF_SHARED, DRV_NAME, host);
+			if (rc)
+				goto err_out;
+		}
 	}
-	if (rc)
-		goto err_out;
 
 	/* register */
 	rc = ata_host_register(host, pi->sht);
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index b2656867c647..db81e3efa5ec 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -1560,7 +1560,7 @@ static int nv_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	ppi[0] = &nv_port_info[type];
-	rc = ata_pci_prepare_native_host(pdev, ppi, &host);
+	rc = ata_pci_prepare_sff_host(pdev, ppi, &host);
 	if (rc)
 		return rc;
 
diff --git a/drivers/ata/sata_sis.c b/drivers/ata/sata_sis.c
index fd80bcf1b236..33716b00c6b7 100644
--- a/drivers/ata/sata_sis.c
+++ b/drivers/ata/sata_sis.c
@@ -334,7 +334,7 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 		break;
 	}
 
-	rc = ata_pci_prepare_native_host(pdev, ppi, &host);
+	rc = ata_pci_prepare_sff_host(pdev, ppi, &host);
 	if (rc)
 		return rc;
 
diff --git a/drivers/ata/sata_uli.c b/drivers/ata/sata_uli.c
index aca71819f6e8..b52f83ab056a 100644
--- a/drivers/ata/sata_uli.c
+++ b/drivers/ata/sata_uli.c
@@ -213,7 +213,7 @@ static int uli_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	host->private_data = hpriv;
 
 	/* the first two ports are standard SFF */
-	rc = ata_pci_init_native_host(host);
+	rc = ata_pci_init_sff_host(host);
 	if (rc)
 		return rc;
 
diff --git a/drivers/ata/sata_via.c b/drivers/ata/sata_via.c
index a4c0832033d8..c4124475f754 100644
--- a/drivers/ata/sata_via.c
+++ b/drivers/ata/sata_via.c
@@ -412,7 +412,7 @@ static int vt6420_prepare_host(struct pci_dev *pdev, struct ata_host **r_host)
 	struct ata_host *host;
 	int rc;
 
-	rc = ata_pci_prepare_native_host(pdev, ppi, &host);
+	rc = ata_pci_prepare_sff_host(pdev, ppi, &host);
 	if (rc)
 		return rc;
 	*r_host = host;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index bf98d44c8109..0c8b6578bd59 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -870,11 +870,11 @@ struct pci_bits {
 	unsigned long		val;
 };
 
-extern int ata_pci_init_native_host(struct ata_host *host);
+extern int ata_pci_init_sff_host(struct ata_host *host);
 extern int ata_pci_init_bmdma(struct ata_host *host);
-extern int ata_pci_prepare_native_host(struct pci_dev *pdev,
-				const struct ata_port_info * const * ppi,
-				struct ata_host **r_host);
+extern int ata_pci_prepare_sff_host(struct pci_dev *pdev,
+				    const struct ata_port_info * const * ppi,
+				    struct ata_host **r_host);
 extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits);
 extern unsigned long ata_pci_default_filter(struct ata_device *, unsigned long);
 #endif /* CONFIG_PCI */
-- 
cgit v1.3-8-gc7d7


From 75683fe7153c3817bb4fd4491e2a5913af6c463e Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 5 Jul 2007 13:31:27 +0900
Subject: libata: clean up horkage handling

Horkage handling had the following problems.

* dev->horkage was positioned after ATA_DEVICE_CLEAR_OFFSET, so it was
  cleared before the device is configured.  This broke
  HORKAGE_DIAGNOSTIC.

* Some used dev->horkage while others called ata_device_blacklisted()
  directly.  This was at best confusing.

This patch moves dev->horkage right after dev->flags and set the field
according to the blacklist during device configuration.  All users
test against dev->horkage.  ata_device_blacklisted() now has only one
user, make it static.  While at it, rename it to ata_dev_blacklisted()
for consistency.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-core.c | 13 ++++++++-----
 include/linux/libata.h    |  3 +--
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index a00e4ee58b73..66c8c8c377ab 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -71,6 +71,7 @@ static unsigned int ata_dev_init_params(struct ata_device *dev,
 					u16 heads, u16 sectors);
 static unsigned int ata_dev_set_xfermode(struct ata_device *dev);
 static void ata_dev_xfermask(struct ata_device *dev);
+static unsigned long ata_dev_blacklisted(const struct ata_device *dev);
 
 unsigned int ata_print_id = 1;
 static struct workqueue_struct *ata_wq;
@@ -1784,7 +1785,7 @@ static void ata_dev_config_ncq(struct ata_device *dev,
 		desc[0] = '\0';
 		return;
 	}
-	if (ata_device_blacklisted(dev) & ATA_HORKAGE_NONCQ) {
+	if (dev->horkage & ATA_HORKAGE_NONCQ) {
 		snprintf(desc, desc_sz, "NCQ (not used)");
 		return;
 	}
@@ -1833,6 +1834,9 @@ int ata_dev_configure(struct ata_device *dev)
 	if (ata_msg_probe(ap))
 		ata_dev_printk(dev, KERN_DEBUG, "%s: ENTER\n", __FUNCTION__);
 
+	/* set horkage */
+	dev->horkage |= ata_dev_blacklisted(dev);
+
 	/* let ACPI work its magic */
 	rc = ata_acpi_on_devcfg(dev);
 	if (rc)
@@ -2008,7 +2012,7 @@ int ata_dev_configure(struct ata_device *dev)
 		dev->max_sectors = ATA_MAX_SECTORS;
 	}
 
-	if (ata_device_blacklisted(dev) & ATA_HORKAGE_MAX_SEC_128)
+	if (dev->horkage & ATA_HORKAGE_MAX_SEC_128)
 		dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128,
 					 dev->max_sectors);
 
@@ -3775,7 +3779,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ }
 };
 
-unsigned long ata_device_blacklisted(const struct ata_device *dev)
+static unsigned long ata_dev_blacklisted(const struct ata_device *dev)
 {
 	unsigned char model_num[ATA_ID_PROD_LEN + 1];
 	unsigned char model_rev[ATA_ID_FW_REV_LEN + 1];
@@ -3805,7 +3809,7 @@ static int ata_dma_blacklisted(const struct ata_device *dev)
 	if ((dev->ap->flags & ATA_FLAG_PIO_POLLING) &&
 	    (dev->flags & ATA_DFLAG_CDB_INTR))
 		return 1;
-	return (ata_device_blacklisted(dev) & ATA_HORKAGE_NODMA) ? 1 : 0;
+	return (dev->horkage & ATA_HORKAGE_NODMA) ? 1 : 0;
 }
 
 /**
@@ -6918,7 +6922,6 @@ EXPORT_SYMBOL_GPL(ata_host_resume);
 EXPORT_SYMBOL_GPL(ata_id_string);
 EXPORT_SYMBOL_GPL(ata_id_c_string);
 EXPORT_SYMBOL_GPL(ata_id_to_dma_mode);
-EXPORT_SYMBOL_GPL(ata_device_blacklisted);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
 
 EXPORT_SYMBOL_GPL(ata_pio_need_iordy);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0c8b6578bd59..47cd2a1c5544 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -434,6 +434,7 @@ struct ata_device {
 	struct ata_port		*ap;
 	unsigned int		devno;		/* 0 or 1 */
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
+	unsigned int		horkage;	/* List of broken features */
 	struct scsi_device	*sdev;		/* attached SCSI device */
 #ifdef CONFIG_ATA_ACPI
 	acpi_handle		acpi_handle;
@@ -465,7 +466,6 @@ struct ata_device {
 	/* error history */
 	struct ata_ering	ering;
 	int			spdn_cnt;
-	unsigned int		horkage;	/* List of broken features */
 };
 
 /* Offset into struct ata_device.  Fields above it are maintained
@@ -793,7 +793,6 @@ extern void ata_id_string(const u16 *id, unsigned char *s,
 extern void ata_id_c_string(const u16 *id, unsigned char *s,
 			    unsigned int ofs, unsigned int len);
 extern void ata_id_to_dma_mode(struct ata_device *dev, u8 unknown);
-extern unsigned long ata_device_blacklisted(const struct ata_device *dev);
 extern void ata_bmdma_setup (struct ata_queued_cmd *qc);
 extern void ata_bmdma_start (struct ata_queued_cmd *qc);
 extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
-- 
cgit v1.3-8-gc7d7


From 814600ee10d3c056ada315cdbdc2ebe48f54c75a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 1 Jul 2007 19:05:58 +0900
Subject: libata-link: add PMP related ATA constants

Add Port Multiplier related ATA constants and macros.  Some of these
will be used by ata_link implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/ata.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 407dc7e098bc..b5a20162af32 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -164,6 +164,8 @@ enum {
 	ATA_CMD_SET_MAX		= 0xF9,
 	ATA_CMD_SET_MAX_EXT	= 0x37,
 	ATA_CMD_READ_LOG_EXT	= 0x2f,
+	ATA_CMD_PMP_READ	= 0xE4,
+	ATA_CMD_PMP_WRITE	= 0xE8,
 
 	/* READ_LOG_EXT pages */
 	ATA_LOG_SATA_NCQ	= 0x10,
@@ -212,6 +214,28 @@ enum {
 						   0=to device, 1=to host */
 	ATAPI_CDB_LEN		= 16,
 
+	/* PMP stuff */
+	SATA_PMP_MAX_PORTS	= 15,
+	SATA_PMP_CTRL_PORT	= 15,
+
+	SATA_PMP_GSCR_DWORDS	= 128,
+	SATA_PMP_GSCR_PROD_ID	= 0,
+	SATA_PMP_GSCR_REV	= 1,
+	SATA_PMP_GSCR_PORT_INFO	= 2,
+	SATA_PMP_GSCR_ERROR	= 32,
+	SATA_PMP_GSCR_ERROR_EN	= 33,
+	SATA_PMP_GSCR_FEAT	= 64,
+	SATA_PMP_GSCR_FEAT_EN	= 96,
+
+	SATA_PMP_PSCR_STATUS	= 0,
+	SATA_PMP_PSCR_ERROR	= 1,
+	SATA_PMP_PSCR_CONTROL	= 2,
+
+	SATA_PMP_FEAT_BIST	= (1 << 0),
+	SATA_PMP_FEAT_PMREQ	= (1 << 1),
+	SATA_PMP_FEAT_DYNSSC	= (1 << 2),
+	SATA_PMP_FEAT_NOTIFY	= (1 << 3),
+
 	/* cable types */
 	ATA_CBL_NONE		= 0,
 	ATA_CBL_PATA40		= 1,
@@ -418,4 +442,9 @@ static inline int lba_48_ok(u64 block, u32 n_block)
 	return ((block + n_block - 1) < ((u64)1 << 48)) && (n_block <= 65536);
 }
 
+#define sata_pmp_gscr_vendor(gscr)	((gscr)[SATA_PMP_GSCR_PROD_ID] & 0xffff)
+#define sata_pmp_gscr_devid(gscr)	((gscr)[SATA_PMP_GSCR_PROD_ID] >> 16)
+#define sata_pmp_gscr_rev(gscr)		(((gscr)[SATA_PMP_GSCR_REV] >> 8) & 0xff)
+#define sata_pmp_gscr_ports(gscr)	((gscr)[SATA_PMP_GSCR_PORT_INFO] & 0xf)
+
 #endif /* __LINUX_ATA_H__ */
-- 
cgit v1.3-8-gc7d7


From 88be9f990fe70f0f177ef44a16a477599e91f825 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 5 Jun 2007 10:42:27 -0400
Subject: NFS: Replace vfsmount and dentry in nfs_open_context with struct path

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c    |  2 +-
 fs/nfs/direct.c        |  4 ++--
 fs/nfs/inode.c         | 10 +++++-----
 fs/nfs/nfs4proc.c      |  6 +++---
 fs/nfs/pagelist.c      |  6 +++---
 fs/nfs/read.c          |  6 +++---
 fs/nfs/write.c         | 20 ++++++++++----------
 include/linux/nfs_fs.h |  4 ++--
 8 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f37d1bea83f..b47c156a711d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -74,7 +74,7 @@ again:
 			continue;
 		get_nfs_open_context(ctx);
 		spin_unlock(&inode->i_lock);
-		err = nfs4_open_delegation_recall(ctx->dentry, state);
+		err = nfs4_open_delegation_recall(ctx->path.dentry, state);
 		if (err >= 0)
 			err = nfs_delegation_claim_locks(ctx, state);
 		put_nfs_open_context(ctx);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f1b153ad645b..a5c82b6f3b45 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,7 +266,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
-	struct inode *inode = ctx->dentry->d_inode;
+	struct inode *inode = ctx->path.dentry->d_inode;
 	size_t rsize = NFS_SERVER(inode)->rsize;
 	unsigned int pgbase;
 	int result;
@@ -606,7 +606,7 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
-	struct inode *inode = ctx->dentry->d_inode;
+	struct inode *inode = ctx->path.dentry->d_inode;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int pgbase;
 	int result;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd9f5a836592..cc7a9064be90 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -462,8 +462,8 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
 		atomic_set(&ctx->count, 1);
-		ctx->dentry = dget(dentry);
-		ctx->vfsmnt = mntget(mnt);
+		ctx->path.dentry = dget(dentry);
+		ctx->path.mnt = mntget(mnt);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
 		ctx->lockowner = current->files;
@@ -484,7 +484,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 {
 	if (atomic_dec_and_test(&ctx->count)) {
 		if (!list_empty(&ctx->list)) {
-			struct inode *inode = ctx->dentry->d_inode;
+			struct inode *inode = ctx->path.dentry->d_inode;
 			spin_lock(&inode->i_lock);
 			list_del(&ctx->list);
 			spin_unlock(&inode->i_lock);
@@ -493,8 +493,8 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 			nfs4_close_state(ctx->state, ctx->mode);
 		if (ctx->cred != NULL)
 			put_rpccred(ctx->cred);
-		dput(ctx->dentry);
-		mntput(ctx->vfsmnt);
+		dput(ctx->path.dentry);
+		mntput(ctx->path.mnt);
 		kfree(ctx);
 	}
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 648e0ac0f90e..4d641cbdbde1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -512,7 +512,7 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta
 	ctx = nfs4_state_find_open_context(state);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
-	ret = nfs4_do_open_reclaim(sp, state, ctx->dentry);
+	ret = nfs4_do_open_reclaim(sp, state, ctx->path.dentry);
 	put_nfs_open_context(ctx);
 	return ret;
 }
@@ -862,7 +862,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
 	ctx = nfs4_state_find_open_context(state);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
-	ret = nfs4_do_open_expired(sp, state, ctx->dentry);
+	ret = nfs4_do_open_expired(sp, state, ctx->path.dentry);
 	put_nfs_open_context(ctx);
 	return ret;
 }
@@ -3285,7 +3285,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
 					sizeof(data->lsp->ls_stateid.data));
 		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
-		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
+		renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp);
 	}
 	nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid);
 out:
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c5bb51a29e80..f8a4ba533930 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -114,7 +114,7 @@ void nfs_unlock_request(struct nfs_page *req)
  */
 int nfs_set_page_writeback_locked(struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
+	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
 
 	if (!nfs_lock_request(req))
 		return 0;
@@ -127,7 +127,7 @@ int nfs_set_page_writeback_locked(struct nfs_page *req)
  */
 void nfs_clear_page_writeback(struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
+	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
 
 	if (req->wb_page != NULL) {
 		spin_lock(&nfsi->req_lock);
@@ -193,7 +193,7 @@ static int nfs_wait_bit_interruptible(void *word)
 int
 nfs_wait_on_request(struct nfs_page *req)
 {
-        struct rpc_clnt	*clnt = NFS_CLIENT(req->wb_context->dentry->d_inode);
+	struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->path.dentry->d_inode);
 	sigset_t oldmask;
 	int ret = 0;
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c07d0d10d9ed..6ae2e58ed05a 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -145,8 +145,8 @@ static void nfs_readpage_release(struct nfs_page *req)
 	unlock_page(req->wb_page);
 
 	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
-			req->wb_context->dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+			req->wb_context->path.dentry->d_inode->i_sb->s_id,
+			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 	nfs_clear_request(req);
@@ -164,7 +164,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	int flags;
 
 	data->req	  = req;
-	data->inode	  = inode = req->wb_context->dentry->d_inode;
+	data->inode	  = inode = req->wb_context->path.dentry->d_inode;
 	data->cred	  = req->wb_context->cred;
 
 	data->args.fh     = NFS_FH(inode);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b853959d964d..9e7c21da864f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -407,7 +407,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
  */
 static void nfs_inode_remove_request(struct nfs_page *req)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	BUG_ON (!NFS_WBACK_BUSY(req));
@@ -455,7 +455,7 @@ nfs_dirty_request(struct nfs_page *req)
 static void
 nfs_mark_request_commit(struct nfs_page *req)
 {
-	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&nfsi->req_lock);
@@ -789,7 +789,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	 * NB: take care not to mess about with data->commit et al. */
 
 	data->req = req;
-	data->inode = inode = req->wb_context->dentry->d_inode;
+	data->inode = inode = req->wb_context->path.dentry->d_inode;
 	data->cred = req->wb_context->cred;
 
 	data->args.fh     = NFS_FH(inode);
@@ -957,8 +957,8 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 	struct page		*page = req->wb_page;
 
 	dprintk("NFS: write (%s/%Ld %d@%Ld)",
-		req->wb_context->dentry->d_inode->i_sb->s_id,
-		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+		req->wb_context->path.dentry->d_inode->i_sb->s_id,
+		(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 		req->wb_bytes,
 		(long long)req_offset(req));
 
@@ -1023,8 +1023,8 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 		page = req->wb_page;
 
 		dprintk("NFS: write (%s/%Ld %d@%Ld)",
-			req->wb_context->dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+			req->wb_context->path.dentry->d_inode->i_sb->s_id,
+			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 
@@ -1162,7 +1162,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 
 	list_splice_init(head, &data->pages);
 	first = nfs_list_entry(data->pages.next);
-	inode = first->wb_context->dentry->d_inode;
+	inode = first->wb_context->path.dentry->d_inode;
 
 	data->inode	  = inode;
 	data->cred	  = first->wb_context->cred;
@@ -1239,8 +1239,8 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 
 		dprintk("NFS: commit (%s/%Ld %d@%Ld)",
-			req->wb_context->dentry->d_inode->i_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+			req->wb_context->path.dentry->d_inode->i_sb->s_id,
+			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 		if (task->tk_status < 0) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0543439a97af..07eea8f64ecf 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -31,6 +31,7 @@
 
 #include <linux/in.h>
 #include <linux/mm.h>
+#include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
@@ -70,8 +71,7 @@ struct nfs_access_entry {
 struct nfs4_state;
 struct nfs_open_context {
 	atomic_t count;
-	struct vfsmount *vfsmnt;
-	struct dentry *dentry;
+	struct path path;
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	fl_owner_t lockowner;
-- 
cgit v1.3-8-gc7d7


From aa53ed541a1fec78a78d02afc8b042d040cc080d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 5 Jun 2007 14:49:03 -0400
Subject: NFS4: on a O_EXCL OPEN make sure SETATTR sets the fields holding the
 verifier

The Linux NFS4 client simply skips over the bitmask in an O_EXCL open
call and so it doesn't bother to reset any fields that may be holding
the verifier. This patch has us save the first two words of the bitmask
(which is all the current client has #defines for). The client then
later checks this bitmask and turns on the appropriate flags in the
sattr->ia_verify field for the following SETATTR call.

This patch only currently checks to see if the server used the atime
and mtime slots for the verifier (which is what the Linux server uses
for this). I'm not sure of what other fields the server could
reasonably use, but adding checks for others should be trivial.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 20 ++++++++++++++++++++
 fs/nfs/nfs4xdr.c        |  9 +++++++--
 include/linux/nfs4.h    |  1 +
 include/linux/nfs_xdr.h |  1 +
 4 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3cc75445a68d..fee2d14b158b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -942,6 +942,22 @@ static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, st
 	return res;
 }
 
+/*
+ * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
+ * fields corresponding to attributes that were used to store the verifier.
+ * Make sure we clobber those fields in the later setattr call
+ */
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+{
+	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+	    !(sattr->ia_valid & ATTR_ATIME_SET))
+		sattr->ia_valid |= ATTR_ATIME;
+
+	if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+	    !(sattr->ia_valid & ATTR_MTIME_SET))
+		sattr->ia_valid |= ATTR_MTIME;
+}
+
 /*
  * Returns a referenced nfs4_state
  */
@@ -973,6 +989,9 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 	if (status != 0)
 		goto err_opendata_free;
 
+	if (opendata->o_arg.open_flags & O_EXCL)
+		nfs4_exclusive_attrset(opendata, sattr);
+
 	status = -ENOMEM;
 	state = nfs4_opendata_to_nfs4_state(opendata);
 	if (state == NULL)
@@ -1784,6 +1803,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
 		if (status == 0)
 			nfs_setattr_update_inode(state->inode, sattr);
+		nfs_post_op_update_inode(state->inode, &fattr);
 	}
 	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
 		status = nfs4_intent_set_file(nd, &path, state);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8003c91ccb9a..1fcca516e6ee 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3269,7 +3269,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 {
         __be32 *p;
-        uint32_t bmlen;
+	uint32_t savewords, bmlen, i;
         int status;
 
         status = decode_op_hdr(xdr, OP_OPEN);
@@ -3287,7 +3287,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
                 goto xdr_error;
 
         READ_BUF(bmlen << 2);
-        p += bmlen;
+	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
+	for (i = 0; i < savewords; ++i)
+		READ32(res->attrset[i]);
+	for (; i < NFS4_BITMAP_SIZE; i++)
+		res->attrset[i] = 0;
+
 	return decode_delegation(xdr, res);
 xdr_error:
 	dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen);
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 7e7f33a38fc0..8726491de154 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -15,6 +15,7 @@
 
 #include <linux/types.h>
 
+#define NFS4_BITMAP_SIZE	2
 #define NFS4_VERIFIER_SIZE	8
 #define NFS4_STATEID_SIZE	16
 #define NFS4_FHSIZE		128
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 10c26ed0db71..f7100df3a690 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -144,6 +144,7 @@ struct nfs_openres {
 	nfs4_stateid		delegation;
 	__u32			do_recall;
 	__u64			maxsize;
+	__u32			attrset[NFS4_BITMAP_SIZE];
 };
 
 /*
-- 
cgit v1.3-8-gc7d7


From c03b40246123b2ced79e2620d1d2c089bb12369a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 17 Jun 2007 13:26:38 -0400
Subject: NFS: Convert struct nfs_page to use krefs

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c        | 14 ++++++++------
 fs/nfs/write.c           |  6 +++---
 include/linux/nfs_page.h | 10 +++++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index f8a4ba533930..257a7f8b2362 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -85,9 +85,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	req->wb_offset  = offset;
 	req->wb_pgbase	= offset;
 	req->wb_bytes   = count;
-	atomic_set(&req->wb_count, 1);
 	req->wb_context = get_nfs_open_context(ctx);
-
+	kref_init(&req->wb_kref);
 	return req;
 }
 
@@ -160,11 +159,9 @@ void nfs_clear_request(struct nfs_page *req)
  *
  * Note: Should never be called with the spinlock held!
  */
-void
-nfs_release_request(struct nfs_page *req)
+static void nfs_free_request(struct kref *kref)
 {
-	if (!atomic_dec_and_test(&req->wb_count))
-		return;
+	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
 
 	/* Release struct file or cached credential */
 	nfs_clear_request(req);
@@ -172,6 +169,11 @@ nfs_release_request(struct nfs_page *req)
 	nfs_page_free(req);
 }
 
+void nfs_release_request(struct nfs_page *req)
+{
+	kref_put(&req->wb_kref, nfs_free_request);
+}
+
 static int nfs_wait_bit_interruptible(void *word)
 {
 	int ret = 0;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9e7c21da864f..e9404328ac02 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -117,7 +117,7 @@ static struct nfs_page *nfs_page_find_request_locked(struct page *page)
 	if (PagePrivate(page)) {
 		req = (struct nfs_page *)page_private(page);
 		if (req != NULL)
-			atomic_inc(&req->wb_count);
+			kref_get(&req->wb_kref);
 	}
 	return req;
 }
@@ -398,7 +398,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	if (PageDirty(req->wb_page))
 		set_bit(PG_NEED_FLUSH, &req->wb_flags);
 	nfsi->npages++;
-	atomic_inc(&req->wb_count);
+	kref_get(&req->wb_kref);
 	return 0;
 }
 
@@ -531,7 +531,7 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u
 		next = req->wb_index + 1;
 		BUG_ON(!NFS_WBACK_BUSY(req));
 
-		atomic_inc(&req->wb_count);
+		kref_get(&req->wb_kref);
 		spin_unlock(&nfsi->req_lock);
 		error = nfs_wait_on_request(req);
 		nfs_release_request(req);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index bd193af80162..c780e7e39f99 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -16,7 +16,7 @@
 #include <linux/sunrpc/auth.h>
 #include <linux/nfs_xdr.h>
 
-#include <asm/atomic.h>
+#include <linux/kref.h>
 
 /*
  * Valid flags for the radix tree
@@ -42,7 +42,7 @@ struct nfs_page {
 	unsigned int		wb_offset,	/* Offset & ~PAGE_CACHE_MASK */
 				wb_pgbase,	/* Start of page data */
 				wb_bytes;	/* Length of request */
-	atomic_t		wb_count;	/* reference count */
+	struct kref		wb_kref;	/* reference count */
 	unsigned long		wb_flags;
 	struct nfs_writeverf	wb_verf;	/* Commit cookie */
 };
@@ -89,7 +89,7 @@ extern  void nfs_clear_page_writeback(struct nfs_page *req);
 
 
 /*
- * Lock the page of an asynchronous request without incrementing the wb_count
+ * Lock the page of an asynchronous request without getting a new reference
  */
 static inline int
 nfs_lock_request_dontget(struct nfs_page *req)
@@ -98,14 +98,14 @@ nfs_lock_request_dontget(struct nfs_page *req)
 }
 
 /*
- * Lock the page of an asynchronous request
+ * Lock the page of an asynchronous request and take a reference
  */
 static inline int
 nfs_lock_request(struct nfs_page *req)
 {
 	if (test_and_set_bit(PG_BUSY, &req->wb_flags))
 		return 0;
-	atomic_inc(&req->wb_count);
+	kref_get(&req->wb_kref);
 	return 1;
 }
 
-- 
cgit v1.3-8-gc7d7


From 9fd367f0f376ccfb2592eed9be0eece70429894f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 17 Jun 2007 15:10:24 -0400
Subject: NFS cleanup: Rename NFS_PAGE_TAG_WRITEBACK to NFS_PAGE_TAG_LOCKED

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c        | 14 +++++++-------
 fs/nfs/write.c           | 16 ++++++++--------
 include/linux/nfs_page.h |  5 ++---
 3 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 257a7f8b2362..23e9dea20902 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -108,29 +108,29 @@ void nfs_unlock_request(struct nfs_page *req)
 }
 
 /**
- * nfs_set_page_writeback_locked - Lock a request for writeback
+ * nfs_set_page_tag_locked - Tag a request as locked
  * @req:
  */
-int nfs_set_page_writeback_locked(struct nfs_page *req)
+static int nfs_set_page_tag_locked(struct nfs_page *req)
 {
 	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
 
 	if (!nfs_lock_request(req))
 		return 0;
-	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK);
+	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
 
 /**
- * nfs_clear_page_writeback - Unlock request and wake up sleepers
+ * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers
  */
-void nfs_clear_page_writeback(struct nfs_page *req)
+void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
 	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
 
 	if (req->wb_page != NULL) {
 		spin_lock(&nfsi->req_lock);
-		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK);
+		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 		spin_unlock(&nfsi->req_lock);
 	}
 	nfs_unlock_request(req);
@@ -421,7 +421,7 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
 			idx_start = req->wb_index + 1;
 			if (req->wb_list_head != head)
 				continue;
-			if (nfs_set_page_writeback_locked(req)) {
+			if (nfs_set_page_tag_locked(req)) {
 				nfs_list_remove_request(req);
 				nfs_list_add_request(req, dst);
 				res++;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e9404328ac02..754066cc9146 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -289,7 +289,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 		BUG();
 	}
 	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
-			NFS_PAGE_TAG_WRITEBACK);
+			NFS_PAGE_TAG_LOCKED);
 	ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
 	spin_unlock(req_lock);
 	nfs_pageio_add_request(pgio, req);
@@ -524,7 +524,7 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u
 		idx_end = idx_start + npages - 1;
 
 	next = idx_start;
-	while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) {
+	while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
 		if (req->wb_index > idx_end)
 			break;
 
@@ -759,7 +759,7 @@ static void nfs_writepage_release(struct nfs_page *req)
 		nfs_inode_remove_request(req);
 	} else
 		nfs_end_page_writeback(req->wb_page);
-	nfs_clear_page_writeback(req);
+	nfs_clear_page_tag_locked(req);
 }
 
 static inline int flush_task_priority(int how)
@@ -888,7 +888,7 @@ out_bad:
 	}
 	nfs_redirty_request(req);
 	nfs_end_page_writeback(req->wb_page);
-	nfs_clear_page_writeback(req);
+	nfs_clear_page_tag_locked(req);
 	return -ENOMEM;
 }
 
@@ -931,7 +931,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 		nfs_list_remove_request(req);
 		nfs_redirty_request(req);
 		nfs_end_page_writeback(req->wb_page);
-		nfs_clear_page_writeback(req);
+		nfs_clear_page_tag_locked(req);
 	}
 	return -ENOMEM;
 }
@@ -1049,7 +1049,7 @@ remove_request:
 		nfs_end_page_writeback(page);
 		nfs_inode_remove_request(req);
 	next:
-		nfs_clear_page_writeback(req);
+		nfs_clear_page_tag_locked(req);
 	}
 }
 
@@ -1212,7 +1212,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		nfs_clear_page_writeback(req);
+		nfs_clear_page_tag_locked(req);
 	}
 	return -ENOMEM;
 }
@@ -1265,7 +1265,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 		dprintk(" mismatch\n");
 		nfs_redirty_request(req);
 	next:
-		nfs_clear_page_writeback(req);
+		nfs_clear_page_tag_locked(req);
 	}
 }
 
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index c780e7e39f99..042434c39b7e 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -21,7 +21,7 @@
 /*
  * Valid flags for the radix tree
  */
-#define NFS_PAGE_TAG_WRITEBACK	0
+#define NFS_PAGE_TAG_LOCKED	0
 
 /*
  * Valid flags for a dirty buffer
@@ -84,8 +84,7 @@ extern	void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
 extern	void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
-extern  int nfs_set_page_writeback_locked(struct nfs_page *req);
-extern  void nfs_clear_page_writeback(struct nfs_page *req);
+extern  void nfs_clear_page_tag_locked(struct nfs_page *req);
 
 
 /*
-- 
cgit v1.3-8-gc7d7


From 5c36968343fcd013a3f7ae93f246c2e75596780b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 17 Jun 2007 15:27:42 -0400
Subject: NFS cleanup: speed up nfs_scan_commit using radix tree tags

Add a tag for requests that are waiting for a COMMIT

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c        | 14 +++++++-------
 fs/nfs/write.c           |  6 +++++-
 include/linux/nfs_page.h |  5 +++--
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 23e9dea20902..ad90cbe76703 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -381,10 +381,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
 /**
  * nfs_scan_list - Scan a list for matching requests
  * @nfsi: NFS inode
- * @head: One of the NFS inode request lists
  * @dst: Destination list
  * @idx_start: lower bound of page->index to scan
  * @npages: idx_start + npages sets the upper bound to scan.
+ * @tag: tag to scan for
  *
  * Moves elements from one of the inode request lists.
  * If the number of requests is set to 0, the entire address_space
@@ -392,9 +392,9 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
  * The requests are *not* checked to ensure that they form a contiguous set.
  * You must be holding the inode's req_lock when calling this function
  */
-int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
+int nfs_scan_list(struct nfs_inode *nfsi,
 		struct list_head *dst, pgoff_t idx_start,
-		unsigned int npages)
+		unsigned int npages, int tag)
 {
 	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
 	struct nfs_page *req;
@@ -409,9 +409,9 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
 		idx_end = idx_start + npages - 1;
 
 	for (;;) {
-		found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
+		found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
 				(void **)&pgvec[0], idx_start,
-				NFS_SCAN_MAXENTRIES);
+				NFS_SCAN_MAXENTRIES, tag);
 		if (found <= 0)
 			break;
 		for (i = 0; i < found; i++) {
@@ -419,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
 			if (req->wb_index > idx_end)
 				goto out;
 			idx_start = req->wb_index + 1;
-			if (req->wb_list_head != head)
-				continue;
 			if (nfs_set_page_tag_locked(req)) {
 				nfs_list_remove_request(req);
+				radix_tree_tag_clear(&nfsi->nfs_page_tree,
+						req->wb_index, tag);
 				nfs_list_add_request(req, dst);
 				res++;
 			}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 754066cc9146..0f779ca12ec3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -462,6 +462,9 @@ nfs_mark_request_commit(struct nfs_page *req)
 	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
 	set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+	radix_tree_tag_set(&nfsi->nfs_page_tree,
+			req->wb_index,
+			NFS_PAGE_TAG_COMMIT);
 	spin_unlock(&nfsi->req_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -575,7 +578,8 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
 	int res = 0;
 
 	if (nfsi->ncommit != 0) {
-		res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
+		res = nfs_scan_list(nfsi, dst, idx_start, npages,
+				NFS_PAGE_TAG_COMMIT);
 		nfsi->ncommit -= res;
 		if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 042434c39b7e..481a42105d69 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -22,6 +22,7 @@
  * Valid flags for the radix tree
  */
 #define NFS_PAGE_TAG_LOCKED	0
+#define NFS_PAGE_TAG_COMMIT	1
 
 /*
  * Valid flags for a dirty buffer
@@ -71,8 +72,8 @@ extern	void nfs_clear_request(struct nfs_page *req);
 extern	void nfs_release_request(struct nfs_page *req);
 
 
-extern	int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, struct list_head *dst,
-			  pgoff_t idx_start, unsigned int npages);
+extern	int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst,
+			  pgoff_t idx_start, unsigned int npages, int tag);
 extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 			     struct inode *inode,
 			     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
-- 
cgit v1.3-8-gc7d7


From 2aefa104313996d1a9582476cee53d1296c834bf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 17 Jun 2007 15:40:59 -0400
Subject: NFS: Remove the redundant 'dirty' and 'commit' lists from nfs_inode

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c           | 3 ---
 fs/nfs/write.c           | 3 ---
 include/linux/nfs_fs.h   | 5 +----
 include/linux/nfs_page.h | 5 +----
 4 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7bcb3dfa6179..e7d2bba900b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1149,14 +1149,11 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&nfsi->vfs_inode);
 	spin_lock_init(&nfsi->req_lock);
-	INIT_LIST_HEAD(&nfsi->dirty);
-	INIT_LIST_HEAD(&nfsi->commit);
 	INIT_LIST_HEAD(&nfsi->open_files);
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
 	atomic_set(&nfsi->data_updates, 0);
-	nfsi->ndirty = 0;
 	nfsi->ncommit = 0;
 	nfsi->npages = 0;
 	nfs4_init_once(nfsi);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0f779ca12ec3..9ef9ec746bfb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -459,7 +459,6 @@ nfs_mark_request_commit(struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&nfsi->req_lock);
-	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
 	set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 	radix_tree_tag_set(&nfsi->nfs_page_tree,
@@ -581,8 +580,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
 		res = nfs_scan_list(nfsi, dst, idx_start, npages,
 				NFS_PAGE_TAG_COMMIT);
 		nfsi->ncommit -= res;
-		if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
-			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
 	}
 	return res;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 07eea8f64ecf..a94205476736 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -156,12 +156,9 @@ struct nfs_inode {
 	 * This is the list of dirty unwritten pages.
 	 */
 	spinlock_t		req_lock;
-	struct list_head	dirty;
-	struct list_head	commit;
 	struct radix_tree_root	nfs_page_tree;
 
-	unsigned int		ndirty,
-				ncommit,
+	unsigned int		ncommit,
 				npages;
 
 	/* Open contexts for shared mmap writes */
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 481a42105d69..78e60798d10e 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -34,8 +34,7 @@
 
 struct nfs_inode;
 struct nfs_page {
-	struct list_head	wb_list,	/* Defines state of page: */
-				*wb_list_head;	/*      read/write/commit */
+	struct list_head	wb_list;	/* Defines state of page: */
 	struct page		*wb_page;	/* page to read in/write out */
 	struct nfs_open_context	*wb_context;	/* File state context info */
 	atomic_t		wb_complete;	/* i/os we're waiting for */
@@ -118,7 +117,6 @@ static inline void
 nfs_list_add_request(struct nfs_page *req, struct list_head *head)
 {
 	list_add_tail(&req->wb_list, head);
-	req->wb_list_head = head;
 }
 
 
@@ -132,7 +130,6 @@ nfs_list_remove_request(struct nfs_page *req)
 	if (list_empty(&req->wb_list))
 		return;
 	list_del_init(&req->wb_list);
-	req->wb_list_head = NULL;
 }
 
 static inline struct nfs_page *
-- 
cgit v1.3-8-gc7d7


From dce34ce298d85b81630401f4feb4bd7ac77fe9c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 17 Jun 2007 15:47:53 -0400
Subject: NFS: Prevent integer overflow in nfs_scan_list()

Also ensure that nfs_inode ncommit and npages are large enough to represent
all possible values for the number of pages.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c      | 2 ++
 include/linux/nfs_fs.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ad90cbe76703..68f6bf122006 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -425,6 +425,8 @@ int nfs_scan_list(struct nfs_inode *nfsi,
 						req->wb_index, tag);
 				nfs_list_add_request(req, dst);
 				res++;
+				if (res == INT_MAX)
+					goto out;
 			}
 		}
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a94205476736..750708ccd708 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -158,7 +158,7 @@ struct nfs_inode {
 	spinlock_t		req_lock;
 	struct radix_tree_root	nfs_page_tree;
 
-	unsigned int		ncommit,
+	unsigned long		ncommit,
 				npages;
 
 	/* Open contexts for shared mmap writes */
-- 
cgit v1.3-8-gc7d7


From 3bec63db55463365110d00721ed60a31e4614cb6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 17 Jun 2007 16:02:44 -0400
Subject: NFS: Convert struct nfs_open_context to use a kref

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 40 +++++++++++++++++++++++-----------------
 include/linux/nfs_fs.h |  3 ++-
 2 files changed, 25 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e7d2bba900b5..01fc8ab0c562 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -461,7 +461,6 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
-		atomic_set(&ctx->count, 1);
 		ctx->path.dentry = dget(dentry);
 		ctx->path.mnt = mntget(mnt);
 		ctx->cred = get_rpccred(cred);
@@ -469,6 +468,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
 		ctx->lockowner = current->files;
 		ctx->error = 0;
 		ctx->dir_cookie = 0;
+		kref_init(&ctx->kref);
 	}
 	return ctx;
 }
@@ -476,27 +476,33 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
 struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 {
 	if (ctx != NULL)
-		atomic_inc(&ctx->count);
+		kref_get(&ctx->kref);
 	return ctx;
 }
 
-void put_nfs_open_context(struct nfs_open_context *ctx)
+static void nfs_free_open_context(struct kref *kref)
 {
-	if (atomic_dec_and_test(&ctx->count)) {
-		if (!list_empty(&ctx->list)) {
-			struct inode *inode = ctx->path.dentry->d_inode;
-			spin_lock(&inode->i_lock);
-			list_del(&ctx->list);
-			spin_unlock(&inode->i_lock);
-		}
-		if (ctx->state != NULL)
-			nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
-		if (ctx->cred != NULL)
-			put_rpccred(ctx->cred);
-		dput(ctx->path.dentry);
-		mntput(ctx->path.mnt);
-		kfree(ctx);
+	struct nfs_open_context *ctx = container_of(kref,
+			struct nfs_open_context, kref);
+
+	if (!list_empty(&ctx->list)) {
+		struct inode *inode = ctx->path.dentry->d_inode;
+		spin_lock(&inode->i_lock);
+		list_del(&ctx->list);
+		spin_unlock(&inode->i_lock);
 	}
+	if (ctx->state != NULL)
+		nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
+	if (ctx->cred != NULL)
+		put_rpccred(ctx->cred);
+	dput(ctx->path.dentry);
+	mntput(ctx->path.mnt);
+	kfree(ctx);
+}
+
+void put_nfs_open_context(struct nfs_open_context *ctx)
+{
+	kref_put(&ctx->kref, nfs_free_open_context);
 }
 
 /*
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 750708ccd708..bf24151d63be 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -30,6 +30,7 @@
 #ifdef __KERNEL__
 
 #include <linux/in.h>
+#include <linux/kref.h>
 #include <linux/mm.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
@@ -70,7 +71,7 @@ struct nfs_access_entry {
 
 struct nfs4_state;
 struct nfs_open_context {
-	atomic_t count;
+	struct kref kref;
 	struct path path;
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
-- 
cgit v1.3-8-gc7d7


From 6529eba08fe7297852391a468d95322913de73fa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 14 Jun 2007 16:40:14 -0400
Subject: SUNRPC: Move rpc_task->tk_task list into struct rpc_clnt

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h  |   4 ++
 include/linux/sunrpc/sched.h |   5 --
 net/sunrpc/clnt.c            |   5 ++
 net/sunrpc/sched.c           | 117 +++++++++++++++++++++++++++----------------
 4 files changed, 83 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 66611423c8ee..0801ab5407ce 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -26,6 +26,8 @@ struct rpc_inode;
 struct rpc_clnt {
 	atomic_t		cl_count;	/* Number of clones */
 	atomic_t		cl_users;	/* number of references */
+	struct list_head	cl_clients;	/* Global list of clients */
+	struct list_head	cl_tasks;	/* List of tasks */
 	struct rpc_xprt *	cl_xprt;	/* transport */
 	struct rpc_procinfo *	cl_procinfo;	/* procedure info */
 	u32			cl_prog,	/* RPC program number */
@@ -122,6 +124,8 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 int		rpc_shutdown_client(struct rpc_clnt *);
 int		rpc_destroy_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
+void		rpc_register_client(struct rpc_clnt *);
+void		rpc_unregister_client(struct rpc_clnt *);
 int		rpcb_register(u32, u32, int, unsigned short, int *);
 void		rpcb_getport(struct rpc_task *);
 
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 2047fb202a13..3387b008cdfc 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -110,11 +110,6 @@ struct rpc_task {
 	if (!list_empty(head) &&  \
 	    ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1))
 
-/* .. and walking list of all tasks */
-#define	alltask_for_each(task, pos, head) \
-	list_for_each(pos, head) \
-		if ((task=list_entry(pos, struct rpc_task, tk_task)),1)
-
 typedef void			(*rpc_action)(struct rpc_task *);
 
 struct rpc_call_ops {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d8fbee40a19c..6631ece14983 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -148,6 +148,7 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 	if (clnt->cl_metrics == NULL)
 		goto out_no_stats;
 	clnt->cl_program  = program;
+	INIT_LIST_HEAD(&clnt->cl_tasks);
 
 	if (!xprt_bound(clnt->cl_xprt))
 		clnt->cl_autobind = 1;
@@ -172,6 +173,7 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 	if (clnt->cl_nodelen > UNX_MAXNODENAME)
 		clnt->cl_nodelen = UNX_MAXNODENAME;
 	memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen);
+	rpc_register_client(clnt);
 	return clnt;
 
 out_no_auth:
@@ -283,9 +285,11 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	new->cl_autobind = 0;
 	new->cl_oneshot = 0;
 	new->cl_dead = 0;
+	INIT_LIST_HEAD(&new->cl_tasks);
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
+	rpc_register_client(new);
 	return new;
 out_no_path:
 	rpc_free_iostats(new->cl_metrics);
@@ -357,6 +361,7 @@ rpc_destroy_client(struct rpc_clnt *clnt)
 	if (clnt->cl_server != clnt->cl_inline_name)
 		kfree(clnt->cl_server);
 out_free:
+	rpc_unregister_client(clnt);
 	rpc_free_iostats(clnt->cl_metrics);
 	clnt->cl_metrics = NULL;
 	xprt_put(clnt->cl_xprt);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 944d75396fb3..6309f3b52c53 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -50,9 +50,10 @@ static void			 rpc_release_task(struct rpc_task *task);
 static RPC_WAITQ(delay_queue, "delayq");
 
 /*
- * All RPC tasks are linked into this list
+ * All RPC clients are linked into this list
  */
-static LIST_HEAD(all_tasks);
+static LIST_HEAD(all_clients);
+static DECLARE_WAIT_QUEUE_HEAD(client_kill_wait);
 
 /*
  * rpciod-related stuff
@@ -277,7 +278,8 @@ static void rpc_set_active(struct rpc_task *task)
 	task->tk_pid = rpc_task_id++;
 #endif
 	/* Add to global list of all tasks */
-	list_add_tail(&task->tk_task, &all_tasks);
+	if (task->tk_client)
+		list_add_tail(&task->tk_task, &task->tk_client->cl_tasks);
 	spin_unlock(&rpc_sched_lock);
 }
 
@@ -818,6 +820,7 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons
 	if (tk_ops->rpc_call_prepare != NULL)
 		task->tk_action = rpc_prepare_task;
 	task->tk_calldata = calldata;
+	INIT_LIST_HEAD(&task->tk_task);
 
 	/* Initialize retry counters */
 	task->tk_garb_retry = 2;
@@ -920,11 +923,12 @@ static void rpc_release_task(struct rpc_task *task)
 #endif
 	dprintk("RPC: %5u release task\n", task->tk_pid);
 
-	/* Remove from global task list */
-	spin_lock(&rpc_sched_lock);
-	list_del(&task->tk_task);
-	spin_unlock(&rpc_sched_lock);
-
+	if (!list_empty(&task->tk_task)) {
+		/* Remove from client task list */
+		spin_lock(&rpc_sched_lock);
+		list_del(&task->tk_task);
+		spin_unlock(&rpc_sched_lock);
+	}
 	BUG_ON (RPC_IS_QUEUED(task));
 
 	/* Synchronously delete any running timer */
@@ -966,42 +970,52 @@ EXPORT_SYMBOL(rpc_run_task);
  * Kill all tasks for the given client.
  * XXX: kill their descendants as well?
  */
-void rpc_killall_tasks(struct rpc_clnt *clnt)
+static void rpc_killall_tasks_locked(struct list_head *head)
 {
 	struct rpc_task	*rovr;
-	struct list_head *le;
 
-	dprintk("RPC:       killing all tasks for client %p\n", clnt);
 
-	/*
-	 * Spin lock all_tasks to prevent changes...
-	 */
-	spin_lock(&rpc_sched_lock);
-	alltask_for_each(rovr, le, &all_tasks) {
+	list_for_each_entry(rovr, head, tk_task) {
 		if (! RPC_IS_ACTIVATED(rovr))
 			continue;
-		if (!clnt || rovr->tk_client == clnt) {
+		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
 			rovr->tk_flags |= RPC_TASK_KILLED;
 			rpc_exit(rovr, -EIO);
 			rpc_wake_up_task(rovr);
 		}
 	}
+}
+
+void rpc_killall_tasks(struct rpc_clnt *clnt)
+{
+	dprintk("RPC:       killing all tasks for client %p\n", clnt);
+	/*
+	 * Spin lock all_tasks to prevent changes...
+	 */
+	spin_lock(&rpc_sched_lock);
+	rpc_killall_tasks_locked(&clnt->cl_tasks);
 	spin_unlock(&rpc_sched_lock);
 }
 
 static void rpciod_killall(void)
 {
+	struct rpc_clnt *clnt;
 	unsigned long flags;
 
-	while (!list_empty(&all_tasks)) {
+	for(;;) {
 		clear_thread_flag(TIF_SIGPENDING);
-		rpc_killall_tasks(NULL);
+
+		spin_lock(&rpc_sched_lock);
+		list_for_each_entry(clnt, &all_clients, cl_clients)
+			rpc_killall_tasks_locked(&clnt->cl_tasks);
+		spin_unlock(&rpc_sched_lock);
 		flush_workqueue(rpciod_workqueue);
-		if (!list_empty(&all_tasks)) {
-			dprintk("RPC:       rpciod_killall: waiting for tasks "
+		if (!list_empty(&all_clients))
+			break;
+		dprintk("RPC:       rpciod_killall: waiting for tasks "
 					"to exit\n");
-			yield();
-		}
+		wait_event_timeout(client_kill_wait,
+				list_empty(&all_clients), 1*HZ);
 	}
 
 	spin_lock_irqsave(&current->sighand->siglock, flags);
@@ -1009,6 +1023,22 @@ static void rpciod_killall(void)
 	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 }
 
+void rpc_register_client(struct rpc_clnt *clnt)
+{
+	spin_lock(&rpc_sched_lock);
+	list_add(&clnt->cl_clients, &all_clients);
+	spin_unlock(&rpc_sched_lock);
+}
+
+void rpc_unregister_client(struct rpc_clnt *clnt)
+{
+	spin_lock(&rpc_sched_lock);
+	list_del(&clnt->cl_clients);
+	if (list_empty(&all_clients))
+		wake_up(&client_kill_wait);
+	spin_unlock(&rpc_sched_lock);
+}
+
 /*
  * Start up the rpciod process if it's not already running.
  */
@@ -1071,32 +1101,33 @@ rpciod_down(void)
 #ifdef RPC_DEBUG
 void rpc_show_tasks(void)
 {
-	struct list_head *le;
+	struct rpc_clnt *clnt;
 	struct rpc_task *t;
 
 	spin_lock(&rpc_sched_lock);
-	if (list_empty(&all_tasks)) {
-		spin_unlock(&rpc_sched_lock);
-		return;
-	}
+	if (list_empty(&all_clients))
+		goto out;
 	printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
 		"-rpcwait -action- ---ops--\n");
-	alltask_for_each(t, le, &all_tasks) {
-		const char *rpc_waitq = "none";
-
-		if (RPC_IS_QUEUED(t))
-			rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
-
-		printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
-			t->tk_pid,
-			(t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
-			t->tk_flags, t->tk_status,
-			t->tk_client,
-			(t->tk_client ? t->tk_client->cl_prog : 0),
-			t->tk_rqstp, t->tk_timeout,
-			rpc_waitq,
-			t->tk_action, t->tk_ops);
+	list_for_each_entry(clnt, &all_clients, cl_clients) {
+		list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
+			const char *rpc_waitq = "none";
+
+			if (RPC_IS_QUEUED(t))
+				rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
+
+			printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
+				t->tk_pid,
+				(t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+				t->tk_flags, t->tk_status,
+				t->tk_client,
+				(t->tk_client ? t->tk_client->cl_prog : 0),
+				t->tk_rqstp, t->tk_timeout,
+				rpc_waitq,
+				t->tk_action, t->tk_ops);
+		}
 	}
+out:
 	spin_unlock(&rpc_sched_lock);
 }
 #endif
-- 
cgit v1.3-8-gc7d7


From 4bef61ff7514396419563ca54fd42ef846485b06 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 16 Jun 2007 14:17:01 -0400
Subject: SUNRPC: Add a per-rpc_clnt spinlock

Use that to protect the rpc_clnt->cl_tasks list instead of using a global
lock.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           |  2 ++
 net/sunrpc/sched.c          | 47 ++++++++++++++++++++++++++-------------------
 3 files changed, 30 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 0801ab5407ce..2f4b520a7419 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -28,6 +28,7 @@ struct rpc_clnt {
 	atomic_t		cl_users;	/* number of references */
 	struct list_head	cl_clients;	/* Global list of clients */
 	struct list_head	cl_tasks;	/* List of tasks */
+	spinlock_t		cl_lock;	/* spinlock */
 	struct rpc_xprt *	cl_xprt;	/* transport */
 	struct rpc_procinfo *	cl_procinfo;	/* procedure info */
 	u32			cl_prog,	/* RPC program number */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 6631ece14983..424dfdc6862c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -149,6 +149,7 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 		goto out_no_stats;
 	clnt->cl_program  = program;
 	INIT_LIST_HEAD(&clnt->cl_tasks);
+	spin_lock_init(&clnt->cl_lock);
 
 	if (!xprt_bound(clnt->cl_xprt))
 		clnt->cl_autobind = 1;
@@ -286,6 +287,7 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	new->cl_oneshot = 0;
 	new->cl_dead = 0;
 	INIT_LIST_HEAD(&new->cl_tasks);
+	spin_lock_init(&new->cl_lock);
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 6309f3b52c53..f56ebc5a08f7 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -270,17 +270,22 @@ static int rpc_wait_bit_interruptible(void *word)
 
 static void rpc_set_active(struct rpc_task *task)
 {
+	struct rpc_clnt *clnt;
 	if (test_and_set_bit(RPC_TASK_ACTIVE, &task->tk_runstate) != 0)
 		return;
-	spin_lock(&rpc_sched_lock);
 #ifdef RPC_DEBUG
 	task->tk_magic = RPC_TASK_MAGIC_ID;
+	spin_lock(&rpc_sched_lock);
 	task->tk_pid = rpc_task_id++;
+	spin_unlock(&rpc_sched_lock);
 #endif
 	/* Add to global list of all tasks */
-	if (task->tk_client)
-		list_add_tail(&task->tk_task, &task->tk_client->cl_tasks);
-	spin_unlock(&rpc_sched_lock);
+	clnt = task->tk_client;
+	if (clnt != NULL) {
+		spin_lock(&clnt->cl_lock);
+		list_add_tail(&task->tk_task, &clnt->cl_tasks);
+		spin_unlock(&clnt->cl_lock);
+	}
 }
 
 /*
@@ -924,10 +929,11 @@ static void rpc_release_task(struct rpc_task *task)
 	dprintk("RPC: %5u release task\n", task->tk_pid);
 
 	if (!list_empty(&task->tk_task)) {
+		struct rpc_clnt *clnt = task->tk_client;
 		/* Remove from client task list */
-		spin_lock(&rpc_sched_lock);
+		spin_lock(&clnt->cl_lock);
 		list_del(&task->tk_task);
-		spin_unlock(&rpc_sched_lock);
+		spin_unlock(&clnt->cl_lock);
 	}
 	BUG_ON (RPC_IS_QUEUED(task));
 
@@ -970,12 +976,19 @@ EXPORT_SYMBOL(rpc_run_task);
  * Kill all tasks for the given client.
  * XXX: kill their descendants as well?
  */
-static void rpc_killall_tasks_locked(struct list_head *head)
+void rpc_killall_tasks(struct rpc_clnt *clnt)
 {
 	struct rpc_task	*rovr;
 
 
-	list_for_each_entry(rovr, head, tk_task) {
+	if (list_empty(&clnt->cl_tasks))
+		return;
+	dprintk("RPC:       killing all tasks for client %p\n", clnt);
+	/*
+	 * Spin lock all_tasks to prevent changes...
+	 */
+	spin_lock(&clnt->cl_lock);
+	list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
 		if (! RPC_IS_ACTIVATED(rovr))
 			continue;
 		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
@@ -984,17 +997,7 @@ static void rpc_killall_tasks_locked(struct list_head *head)
 			rpc_wake_up_task(rovr);
 		}
 	}
-}
-
-void rpc_killall_tasks(struct rpc_clnt *clnt)
-{
-	dprintk("RPC:       killing all tasks for client %p\n", clnt);
-	/*
-	 * Spin lock all_tasks to prevent changes...
-	 */
-	spin_lock(&rpc_sched_lock);
-	rpc_killall_tasks_locked(&clnt->cl_tasks);
-	spin_unlock(&rpc_sched_lock);
+	spin_unlock(&clnt->cl_lock);
 }
 
 static void rpciod_killall(void)
@@ -1007,7 +1010,7 @@ static void rpciod_killall(void)
 
 		spin_lock(&rpc_sched_lock);
 		list_for_each_entry(clnt, &all_clients, cl_clients)
-			rpc_killall_tasks_locked(&clnt->cl_tasks);
+			rpc_killall_tasks(clnt);
 		spin_unlock(&rpc_sched_lock);
 		flush_workqueue(rpciod_workqueue);
 		if (!list_empty(&all_clients))
@@ -1110,6 +1113,9 @@ void rpc_show_tasks(void)
 	printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
 		"-rpcwait -action- ---ops--\n");
 	list_for_each_entry(clnt, &all_clients, cl_clients) {
+		if (list_empty(&clnt->cl_tasks))
+			continue;
+		spin_lock(&clnt->cl_lock);
 		list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
 			const char *rpc_waitq = "none";
 
@@ -1126,6 +1132,7 @@ void rpc_show_tasks(void)
 				rpc_waitq,
 				t->tk_action, t->tk_ops);
 		}
+		spin_unlock(&clnt->cl_lock);
 	}
 out:
 	spin_unlock(&rpc_sched_lock);
-- 
cgit v1.3-8-gc7d7


From 34f52e3591f241b825353ba27def956d8487c400 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 14 Jun 2007 16:40:31 -0400
Subject: SUNRPC: Convert rpc_clnt->cl_users to a kref

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             | 12 +++-------
 include/linux/sunrpc/clnt.h |  2 +-
 net/sunrpc/clnt.c           | 57 ++++++++++++++++++++++-----------------------
 net/sunrpc/rpc_pipe.c       |  2 +-
 net/sunrpc/sched.c          |  6 ++---
 5 files changed, 35 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 96070bff93fc..c252a1c95857 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -161,15 +161,9 @@ nlm_destroy_host(struct nlm_host *host)
 	 */
 	nsm_unmonitor(host);
 
-	if ((clnt = host->h_rpcclnt) != NULL) {
-		if (atomic_read(&clnt->cl_users)) {
-			printk(KERN_WARNING
-				"lockd: active RPC handle\n");
-			clnt->cl_dead = 1;
-		} else {
-			rpc_destroy_client(host->h_rpcclnt);
-		}
-	}
+	clnt = host->h_rpcclnt;
+	if (clnt != NULL)
+		rpc_shutdown_client(clnt);
 	kfree(host);
 }
 
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 2f4b520a7419..003d8ea70c19 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -24,8 +24,8 @@ struct rpc_inode;
  * The high-level client handle
  */
 struct rpc_clnt {
+	struct kref		cl_kref;	/* Number of references */
 	atomic_t		cl_count;	/* Number of clones */
-	atomic_t		cl_users;	/* number of references */
 	struct list_head	cl_clients;	/* Global list of clients */
 	struct list_head	cl_tasks;	/* List of tasks */
 	spinlock_t		cl_lock;	/* spinlock */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 424dfdc6862c..254a6e1a5770 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -121,7 +121,6 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 	clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
 	if (!clnt)
 		goto out_err;
-	atomic_set(&clnt->cl_users, 0);
 	atomic_set(&clnt->cl_count, 1);
 	clnt->cl_parent = clnt;
 
@@ -157,6 +156,8 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 	clnt->cl_rtt = &clnt->cl_rtt_default;
 	rpc_init_rtt(&clnt->cl_rtt_default, xprt->timeout.to_initval);
 
+	kref_init(&clnt->cl_kref);
+
 	err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
 	if (err < 0)
 		goto out_no_path;
@@ -272,10 +273,10 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	if (!new)
 		goto out_no_clnt;
 	atomic_set(&new->cl_count, 1);
-	atomic_set(&new->cl_users, 0);
 	new->cl_metrics = rpc_alloc_iostats(clnt);
 	if (new->cl_metrics == NULL)
 		goto out_no_stats;
+	kref_init(&new->cl_kref);
 	err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
 	if (err != 0)
 		goto out_no_path;
@@ -311,40 +312,28 @@ out_no_clnt:
 int
 rpc_shutdown_client(struct rpc_clnt *clnt)
 {
-	dprintk("RPC:       shutting down %s client for %s, tasks=%d\n",
-			clnt->cl_protname, clnt->cl_server,
-			atomic_read(&clnt->cl_users));
+	dprintk("RPC:       shutting down %s client for %s\n",
+			clnt->cl_protname, clnt->cl_server);
 
-	while (atomic_read(&clnt->cl_users) > 0) {
+	while (!list_empty(&clnt->cl_tasks)) {
 		/* Don't let rpc_release_client destroy us */
 		clnt->cl_oneshot = 0;
 		clnt->cl_dead = 0;
 		rpc_killall_tasks(clnt);
 		wait_event_timeout(destroy_wait,
-			!atomic_read(&clnt->cl_users), 1*HZ);
-	}
-
-	if (atomic_read(&clnt->cl_users) < 0) {
-		printk(KERN_ERR "RPC: rpc_shutdown_client clnt %p tasks=%d\n",
-				clnt, atomic_read(&clnt->cl_users));
-#ifdef RPC_DEBUG
-		rpc_show_tasks();
-#endif
-		BUG();
+			list_empty(&clnt->cl_tasks), 1*HZ);
 	}
 
 	return rpc_destroy_client(clnt);
 }
 
 /*
- * Delete an RPC client
+ * Free an RPC client
  */
-int
-rpc_destroy_client(struct rpc_clnt *clnt)
+static void
+rpc_free_client(struct kref *kref)
 {
-	if (!atomic_dec_and_test(&clnt->cl_count))
-		return 1;
-	BUG_ON(atomic_read(&clnt->cl_users) != 0);
+	struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
 
 	dprintk("RPC:       destroying %s client for %s\n",
 			clnt->cl_protname, clnt->cl_server);
@@ -368,23 +357,33 @@ out_free:
 	clnt->cl_metrics = NULL;
 	xprt_put(clnt->cl_xprt);
 	kfree(clnt);
-	return 0;
 }
 
 /*
- * Release an RPC client
+ * Release reference to the RPC client
  */
 void
 rpc_release_client(struct rpc_clnt *clnt)
 {
-	dprintk("RPC:       rpc_release_client(%p, %d)\n",
-			clnt, atomic_read(&clnt->cl_users));
+	dprintk("RPC:       rpc_release_client(%p)\n", clnt);
 
-	if (!atomic_dec_and_test(&clnt->cl_users))
-		return;
-	wake_up(&destroy_wait);
+	if (list_empty(&clnt->cl_tasks))
+		wake_up(&destroy_wait);
 	if (clnt->cl_oneshot || clnt->cl_dead)
 		rpc_destroy_client(clnt);
+	kref_put(&clnt->cl_kref, rpc_free_client);
+}
+
+/*
+ * Delete an RPC client
+ */
+int
+rpc_destroy_client(struct rpc_clnt *clnt)
+{
+	if (!atomic_dec_and_test(&clnt->cl_count))
+		return 1;
+	kref_put(&clnt->cl_kref, rpc_free_client);
+	return 0;
 }
 
 /**
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 5887457dc936..826190dacfce 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -344,7 +344,7 @@ rpc_info_open(struct inode *inode, struct file *file)
 		mutex_lock(&inode->i_mutex);
 		clnt = RPC_I(inode)->private;
 		if (clnt) {
-			atomic_inc(&clnt->cl_users);
+			kref_get(&clnt->cl_kref);
 			m->private = clnt;
 		} else {
 			single_release(inode, file);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 0e9fbbd4f987..bb12983580a0 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -846,7 +846,7 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons
 	task->tk_workqueue = rpciod_workqueue;
 
 	if (clnt) {
-		atomic_inc(&clnt->cl_users);
+		kref_get(&clnt->cl_kref);
 		if (clnt->cl_softrtry)
 			task->tk_flags |= RPC_TASK_SOFT;
 		if (!clnt->cl_intr)
@@ -898,9 +898,7 @@ out:
 cleanup:
 	/* Check whether to release the client */
 	if (clnt) {
-		printk("rpc_new_task: failed, users=%d, oneshot=%d\n",
-			atomic_read(&clnt->cl_users), clnt->cl_oneshot);
-		atomic_inc(&clnt->cl_users); /* pretend we were used ... */
+		kref_get(&clnt->cl_kref); /* pretend we were used ... */
 		rpc_release_client(clnt);
 	}
 	goto out;
-- 
cgit v1.3-8-gc7d7


From 848f1fe6be2e290691bb6c13cbb8fd92bd0cfaab Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 9 Jun 2007 19:39:12 -0400
Subject: SUNRPC: Kill rpc_clnt->cl_dead

Its use is at best racy, and there is only one user (lockd), which has
additional locking that makes the whole thing redundant.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  3 +--
 net/sunrpc/clnt.c           | 18 +++---------------
 2 files changed, 4 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 003d8ea70c19..ab3ef6d629a7 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -45,8 +45,7 @@ struct rpc_clnt {
 				cl_intr     : 1,/* interruptible */
 				cl_discrtry : 1,/* disconnect before retry */
 				cl_autobind : 1,/* use getport() */
-				cl_oneshot  : 1,/* dispose after use */
-				cl_dead     : 1;/* abandoned */
+				cl_oneshot  : 1;/* dispose after use */
 
 	struct rpc_rtt *	cl_rtt;		/* RTO estimator data */
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 254a6e1a5770..fb65249538d4 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -286,7 +286,6 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	/* Turn off autobind on clones */
 	new->cl_autobind = 0;
 	new->cl_oneshot = 0;
-	new->cl_dead = 0;
 	INIT_LIST_HEAD(&new->cl_tasks);
 	spin_lock_init(&new->cl_lock);
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
@@ -305,9 +304,8 @@ out_no_clnt:
 
 /*
  * Properly shut down an RPC client, terminating all outstanding
- * requests. Note that we must be certain that cl_oneshot and
- * cl_dead are cleared, or else the client would be destroyed
- * when the last task releases it.
+ * requests. Note that we must be certain that cl_oneshot is cleared,
+ * or else the client would be destroyed when the last task releases it.
  */
 int
 rpc_shutdown_client(struct rpc_clnt *clnt)
@@ -318,7 +316,6 @@ rpc_shutdown_client(struct rpc_clnt *clnt)
 	while (!list_empty(&clnt->cl_tasks)) {
 		/* Don't let rpc_release_client destroy us */
 		clnt->cl_oneshot = 0;
-		clnt->cl_dead = 0;
 		rpc_killall_tasks(clnt);
 		wait_event_timeout(destroy_wait,
 			list_empty(&clnt->cl_tasks), 1*HZ);
@@ -369,7 +366,7 @@ rpc_release_client(struct rpc_clnt *clnt)
 
 	if (list_empty(&clnt->cl_tasks))
 		wake_up(&destroy_wait);
-	if (clnt->cl_oneshot || clnt->cl_dead)
+	if (clnt->cl_oneshot)
 		rpc_destroy_client(clnt);
 	kref_put(&clnt->cl_kref, rpc_free_client);
 }
@@ -483,10 +480,6 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 	sigset_t	oldset;
 	int		status;
 
-	/* If this client is slain all further I/O fails */
-	if (clnt->cl_dead)
-		return -EIO;
-
 	BUG_ON(flags & RPC_TASK_ASYNC);
 
 	task = rpc_new_task(clnt, flags, &rpc_default_ops, NULL);
@@ -519,11 +512,6 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
 	sigset_t	oldset;
 	int		status;
 
-	/* If this client is slain all further I/O fails */
-	status = -EIO;
-	if (clnt->cl_dead)
-		goto out_release;
-
 	flags |= RPC_TASK_ASYNC;
 
 	/* Create/initialize a new RPC task */
-- 
cgit v1.3-8-gc7d7


From 90c5755ff5111ffdcca10a1e8a823dba29f37b6d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 9 Jun 2007 19:49:36 -0400
Subject: SUNRPC: Kill rpc_clnt->cl_oneshot

Replace it with explicit calls to rpc_shutdown_client() or
rpc_destroy_client() (for the case of asynchronous calls).

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c              |  2 +-
 fs/nfs/mount_clnt.c         |  4 ++--
 include/linux/sunrpc/clnt.h | 10 ++++------
 net/sunrpc/clnt.c           | 10 +---------
 net/sunrpc/rpcb_clnt.c      |  6 ++++--
 net/sunrpc/sched.c          | 14 ++------------
 6 files changed, 14 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 2102e2d0134d..3353ed8421a7 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,6 +61,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 			status);
 	else
 		status = 0;
+	rpc_shutdown_client(clnt);
  out:
 	return status;
 }
@@ -138,7 +139,6 @@ nsm_create(void)
 		.program	= &nsm_program,
 		.version	= SM_VERSION,
 		.authflavor	= RPC_AUTH_NULL,
-		.flags		= (RPC_CLNT_CREATE_ONESHOT),
 	};
 
 	return rpc_create(&args);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index ca5a266a3140..878d7a5cb6d4 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -69,6 +69,7 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
 
 	status = rpc_call_sync(mnt_clnt, &msg, 0);
+	rpc_shutdown_client(mnt_clnt);
 	return status < 0? status : (result.status? -EACCES : 0);
 }
 
@@ -84,8 +85,7 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
 		.program	= &mnt_program,
 		.version	= version,
 		.authflavor	= RPC_AUTH_UNIX,
-		.flags		= (RPC_CLNT_CREATE_ONESHOT |
-				   RPC_CLNT_CREATE_INTR),
+		.flags		= RPC_CLNT_CREATE_INTR,
 	};
 
 	return rpc_create(&args);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index ab3ef6d629a7..fe7ea65ed0ae 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -44,8 +44,7 @@ struct rpc_clnt {
 	unsigned int		cl_softrtry : 1,/* soft timeouts */
 				cl_intr     : 1,/* interruptible */
 				cl_discrtry : 1,/* disconnect before retry */
-				cl_autobind : 1,/* use getport() */
-				cl_oneshot  : 1;/* dispose after use */
+				cl_autobind : 1;/* use getport() */
 
 	struct rpc_rtt *	cl_rtt;		/* RTO estimator data */
 
@@ -112,10 +111,9 @@ struct rpc_create_args {
 #define RPC_CLNT_CREATE_HARDRTRY	(1UL << 0)
 #define RPC_CLNT_CREATE_INTR		(1UL << 1)
 #define RPC_CLNT_CREATE_AUTOBIND	(1UL << 2)
-#define RPC_CLNT_CREATE_ONESHOT		(1UL << 3)
-#define RPC_CLNT_CREATE_NONPRIVPORT	(1UL << 4)
-#define RPC_CLNT_CREATE_NOPING		(1UL << 5)
-#define RPC_CLNT_CREATE_DISCRTRY	(1UL << 6)
+#define RPC_CLNT_CREATE_NONPRIVPORT	(1UL << 3)
+#define RPC_CLNT_CREATE_NOPING		(1UL << 4)
+#define RPC_CLNT_CREATE_DISCRTRY	(1UL << 5)
 
 struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fb65249538d4..34662dfa9cc0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -249,8 +249,6 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 		clnt->cl_intr = 1;
 	if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
 		clnt->cl_autobind = 1;
-	if (args->flags & RPC_CLNT_CREATE_ONESHOT)
-		clnt->cl_oneshot = 1;
 	if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
 		clnt->cl_discrtry = 1;
 
@@ -285,7 +283,6 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	new->cl_xprt = xprt_get(clnt->cl_xprt);
 	/* Turn off autobind on clones */
 	new->cl_autobind = 0;
-	new->cl_oneshot = 0;
 	INIT_LIST_HEAD(&new->cl_tasks);
 	spin_lock_init(&new->cl_lock);
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
@@ -304,8 +301,7 @@ out_no_clnt:
 
 /*
  * Properly shut down an RPC client, terminating all outstanding
- * requests. Note that we must be certain that cl_oneshot is cleared,
- * or else the client would be destroyed when the last task releases it.
+ * requests.
  */
 int
 rpc_shutdown_client(struct rpc_clnt *clnt)
@@ -314,8 +310,6 @@ rpc_shutdown_client(struct rpc_clnt *clnt)
 			clnt->cl_protname, clnt->cl_server);
 
 	while (!list_empty(&clnt->cl_tasks)) {
-		/* Don't let rpc_release_client destroy us */
-		clnt->cl_oneshot = 0;
 		rpc_killall_tasks(clnt);
 		wait_event_timeout(destroy_wait,
 			list_empty(&clnt->cl_tasks), 1*HZ);
@@ -366,8 +360,6 @@ rpc_release_client(struct rpc_clnt *clnt)
 
 	if (list_empty(&clnt->cl_tasks))
 		wake_up(&destroy_wait);
-	if (clnt->cl_oneshot)
-		rpc_destroy_client(clnt);
 	kref_put(&clnt->cl_kref, rpc_free_client);
 }
 
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 6c7aa8a1f0c6..00853a326499 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -184,8 +184,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 		.program	= &rpcb_program,
 		.version	= version,
 		.authflavor	= RPC_AUTH_UNIX,
-		.flags		= (RPC_CLNT_CREATE_ONESHOT |
-				   RPC_CLNT_CREATE_NOPING),
+		.flags		= RPC_CLNT_CREATE_NOPING,
 	};
 
 	((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
@@ -238,6 +237,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 
 	error = rpc_call_sync(rpcb_clnt, &msg, 0);
 
+	rpc_shutdown_client(rpcb_clnt);
 	if (error < 0)
 		printk(KERN_WARNING "RPC: failed to contact local rpcbind "
 				"server (errno %d).\n", -error);
@@ -286,6 +286,7 @@ int rpcb_getport_external(struct sockaddr_in *sin, __u32 prog,
 		return PTR_ERR(rpcb_clnt);
 
 	status = rpc_call_sync(rpcb_clnt, &msg, 0);
+	rpc_shutdown_client(rpcb_clnt);
 
 	if (status >= 0) {
 		if (map.r_port != 0)
@@ -379,6 +380,7 @@ void rpcb_getport(struct rpc_task *task)
 	}
 
 	child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map);
+	rpc_destroy_client(rpcb_clnt);
 	if (IS_ERR(child)) {
 		status = -EIO;
 		dprintk("RPC: %5u rpcb_getport rpc_run_task failed\n",
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index bb12983580a0..d95fe4e40eb4 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -876,9 +876,7 @@ static void rpc_free_task(struct rcu_head *rcu)
 }
 
 /*
- * Create a new task for the specified client.  We have to
- * clean up after an allocation failure, as the client may
- * have specified "oneshot".
+ * Create a new task for the specified client.
  */
 struct rpc_task *rpc_new_task(struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata)
 {
@@ -886,7 +884,7 @@ struct rpc_task *rpc_new_task(struct rpc_clnt *clnt, int flags, const struct rpc
 
 	task = rpc_alloc_task();
 	if (!task)
-		goto cleanup;
+		goto out;
 
 	rpc_init_task(task, clnt, flags, tk_ops, calldata);
 
@@ -894,14 +892,6 @@ struct rpc_task *rpc_new_task(struct rpc_clnt *clnt, int flags, const struct rpc
 	task->tk_flags |= RPC_TASK_DYNAMIC;
 out:
 	return task;
-
-cleanup:
-	/* Check whether to release the client */
-	if (clnt) {
-		kref_get(&clnt->cl_kref); /* pretend we were used ... */
-		rpc_release_client(clnt);
-	}
-	goto out;
 }
 
 
-- 
cgit v1.3-8-gc7d7


From 4c402b40970382ded616eadd544fd63feb76cc79 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 14 Jun 2007 16:40:32 -0400
Subject: SUNRPC: Remove rpc_clnt->cl_count

The kref now does most of what cl_count + cl_user used to do. The only
remaining role for cl_count is to tell us if we are in a 'shutdown'
phase. We can provide that information using a single bit field instead
of a full atomic counter.

Also rename rpc_destroy_client() to rpc_close_client(), which reflects
better what its role is these days.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  4 +---
 net/sunrpc/clnt.c           | 19 ++-----------------
 net/sunrpc/rpcb_clnt.c      |  2 +-
 net/sunrpc/sunrpc_syms.c    |  1 -
 4 files changed, 4 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index fe7ea65ed0ae..cf03494c36e7 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -25,7 +25,6 @@ struct rpc_inode;
  */
 struct rpc_clnt {
 	struct kref		cl_kref;	/* Number of references */
-	atomic_t		cl_count;	/* Number of clones */
 	struct list_head	cl_clients;	/* Global list of clients */
 	struct list_head	cl_tasks;	/* List of tasks */
 	spinlock_t		cl_lock;	/* spinlock */
@@ -119,8 +118,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 				struct rpc_program *, int);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
-int		rpc_shutdown_client(struct rpc_clnt *);
-int		rpc_destroy_client(struct rpc_clnt *);
+void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
 void		rpc_register_client(struct rpc_clnt *);
 void		rpc_unregister_client(struct rpc_clnt *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 613c10e4ac31..be5524d20822 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -121,7 +121,6 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 	clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
 	if (!clnt)
 		goto out_err;
-	atomic_set(&clnt->cl_count, 1);
 	clnt->cl_parent = clnt;
 
 	clnt->cl_server = clnt->cl_inline_name;
@@ -270,7 +269,6 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	new = kmemdup(clnt, sizeof(*new), GFP_KERNEL);
 	if (!new)
 		goto out_no_clnt;
-	atomic_set(&new->cl_count, 1);
 	new->cl_metrics = rpc_alloc_iostats(clnt);
 	if (new->cl_metrics == NULL)
 		goto out_no_stats;
@@ -303,8 +301,7 @@ out_no_clnt:
  * Properly shut down an RPC client, terminating all outstanding
  * requests.
  */
-int
-rpc_shutdown_client(struct rpc_clnt *clnt)
+void rpc_shutdown_client(struct rpc_clnt *clnt)
 {
 	dprintk("RPC:       shutting down %s client for %s\n",
 			clnt->cl_protname, clnt->cl_server);
@@ -315,7 +312,7 @@ rpc_shutdown_client(struct rpc_clnt *clnt)
 			list_empty(&clnt->cl_tasks), 1*HZ);
 	}
 
-	return rpc_destroy_client(clnt);
+	rpc_release_client(clnt);
 }
 
 /*
@@ -363,18 +360,6 @@ rpc_release_client(struct rpc_clnt *clnt)
 	kref_put(&clnt->cl_kref, rpc_free_client);
 }
 
-/*
- * Delete an RPC client
- */
-int
-rpc_destroy_client(struct rpc_clnt *clnt)
-{
-	if (!atomic_dec_and_test(&clnt->cl_count))
-		return 1;
-	kref_put(&clnt->cl_kref, rpc_free_client);
-	return 0;
-}
-
 /**
  * rpc_bind_new_program - bind a new RPC program to an existing client
  * @old - old rpc_client
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 00853a326499..cf7db59cdcde 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -380,7 +380,7 @@ void rpcb_getport(struct rpc_task *task)
 	}
 
 	child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map);
-	rpc_destroy_client(rpcb_clnt);
+	rpc_release_client(rpcb_clnt);
 	if (IS_ERR(child)) {
 		status = -EIO;
 		dprintk("RPC: %5u rpcb_getport rpc_run_task failed\n",
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index c46d31ca307b..02e83e15fef5 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -35,7 +35,6 @@ EXPORT_SYMBOL(rpc_wake_up_status);
 /* RPC client functions */
 EXPORT_SYMBOL(rpc_clone_client);
 EXPORT_SYMBOL(rpc_bind_new_program);
-EXPORT_SYMBOL(rpc_destroy_client);
 EXPORT_SYMBOL(rpc_shutdown_client);
 EXPORT_SYMBOL(rpc_killall_tasks);
 EXPORT_SYMBOL(rpc_call_sync);
-- 
cgit v1.3-8-gc7d7


From f61534dfd38f895b203e2aadaba04f21a992ca8c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 14 Jun 2007 17:31:58 -0400
Subject: SUNRPC: Remove redundant calls to rpciod_up()/rpciod_down()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svc.c            |  6 ------
 fs/nfs/client.c           | 15 ---------------
 fs/nfsd/nfs4callback.c    | 12 +++---------
 fs/nfsd/nfs4state.c       |  1 -
 include/linux/nfs_fs_sb.h |  1 -
 net/sunrpc/sunrpc_syms.c  |  2 --
 6 files changed, 3 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 126b1bf02c0e..26809325469c 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -123,9 +123,6 @@ lockd(struct svc_rqst *rqstp)
 	/* Process request with signals blocked, but allow SIGKILL.  */
 	allow_signal(SIGKILL);
 
-	/* kick rpciod */
-	rpciod_up();
-
 	dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
 
 	if (!nlm_timeout)
@@ -202,9 +199,6 @@ lockd(struct svc_rqst *rqstp)
 	/* Exit the RPC thread */
 	svc_exit_thread(rqstp);
 
-	/* release rpciod */
-	rpciod_down();
-
 	/* Release module */
 	unlock_kernel();
 	module_put_and_exit(0);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 881fa4900923..71d4c4cdac52 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -102,19 +102,10 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
 					   int nfsversion)
 {
 	struct nfs_client *clp;
-	int error;
 
 	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
 		goto error_0;
 
-	error = rpciod_up();
-	if (error < 0) {
-		dprintk("%s: couldn't start rpciod! Error = %d\n",
-				__FUNCTION__, error);
-		goto error_1;
-	}
-	__set_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
-
 	if (nfsversion == 4) {
 		if (nfs_callback_up() < 0)
 			goto error_2;
@@ -154,9 +145,6 @@ error_3:
 	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
 		nfs_callback_down();
 error_2:
-	rpciod_down();
-	__clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
-error_1:
 	kfree(clp);
 error_0:
 	return NULL;
@@ -198,9 +186,6 @@ static void nfs_free_client(struct nfs_client *clp)
 	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
 		nfs_callback_down();
 
-	if (__test_and_clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state))
-		rpciod_down();
-
 	kfree(clp->cl_hostname);
 	kfree(clp);
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 864090edc28b..6b1b487db1ec 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -429,29 +429,23 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 		goto out_err;
 	}
 
-	/* Kick rpciod, put the call on the wire. */
-	if (rpciod_up() != 0)
-		goto out_clnt;
-
 	/* the task holds a reference to the nfs4_client struct */
 	atomic_inc(&clp->cl_count);
 
 	msg.rpc_cred = nfsd4_lookupcred(clp,0);
 	if (IS_ERR(msg.rpc_cred))
-		goto out_rpciod;
+		goto out_release_clp;
 	status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL);
 	put_rpccred(msg.rpc_cred);
 
 	if (status != 0) {
 		dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n");
-		goto out_rpciod;
+		goto out_release_clp;
 	}
 	return;
 
-out_rpciod:
+out_release_clp:
 	atomic_dec(&clp->cl_count);
-	rpciod_down();
-out_clnt:
 	rpc_shutdown_client(cb->cb_client);
 out_err:
 	cb->cb_client = NULL;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3cc8ce422ab1..8c52913d7cb6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -378,7 +378,6 @@ shutdown_callback_client(struct nfs4_client *clp)
 	if (clnt) {
 		clp->cl_callback.cb_client = NULL;
 		rpc_shutdown_client(clnt);
-		rpciod_down();
 	}
 }
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 52b4378311c8..144d955dc46a 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -16,7 +16,6 @@ struct nfs_client {
 #define NFS_CS_INITING		1		/* busy initialising */
 	int			cl_nfsversion;	/* NFS protocol version */
 	unsigned long		cl_res_state;	/* NFS resources state */
-#define NFS_CS_RPCIOD		0		/* - rpciod started */
 #define NFS_CS_CALLBACK		1		/* - callback started */
 #define NFS_CS_IDMAP		2		/* - idmap started */
 #define NFS_CS_RENEWD		3		/* - renewd started */
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 02e83e15fef5..b99b11b11461 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -28,8 +28,6 @@ EXPORT_SYMBOL(rpc_init_task);
 EXPORT_SYMBOL(rpc_sleep_on);
 EXPORT_SYMBOL(rpc_wake_up_next);
 EXPORT_SYMBOL(rpc_wake_up_task);
-EXPORT_SYMBOL(rpciod_down);
-EXPORT_SYMBOL(rpciod_up);
 EXPORT_SYMBOL(rpc_wake_up_status);
 
 /* RPC client functions */
-- 
cgit v1.3-8-gc7d7


From 188fef11db219f13f32d055ba59985e7d1a349fe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 16 Jun 2007 14:18:40 -0400
Subject: SUNRPC: Move rpc_register_client and friends into net/sunrpc/clnt.c

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  2 --
 net/sunrpc/clnt.c           | 57 +++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/sched.c          | 62 ---------------------------------------------
 3 files changed, 57 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index cf03494c36e7..a451351c7eff 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -120,8 +120,6 @@ struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
-void		rpc_register_client(struct rpc_clnt *);
-void		rpc_unregister_client(struct rpc_clnt *);
 int		rpcb_register(u32, u32, int, unsigned short, int *);
 void		rpcb_getport(struct rpc_task *);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fe838e996ee3..4f39ab1b04d6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -44,6 +44,12 @@
 	dprintk("RPC: %5u %s (status %d)\n", t->tk_pid,		\
 			__FUNCTION__, t->tk_status)
 
+/*
+ * All RPC clients are linked into this list
+ */
+static LIST_HEAD(all_clients);
+static DEFINE_SPINLOCK(rpc_client_lock);
+
 static DECLARE_WAIT_QUEUE_HEAD(destroy_wait);
 
 
@@ -66,6 +72,19 @@ static void	call_connect_status(struct rpc_task *task);
 static __be32 *	call_header(struct rpc_task *task);
 static __be32 *	call_verify(struct rpc_task *task);
 
+static void rpc_register_client(struct rpc_clnt *clnt)
+{
+	spin_lock(&rpc_client_lock);
+	list_add(&clnt->cl_clients, &all_clients);
+	spin_unlock(&rpc_client_lock);
+}
+
+static void rpc_unregister_client(struct rpc_clnt *clnt)
+{
+	spin_lock(&rpc_client_lock);
+	list_del(&clnt->cl_clients);
+	spin_unlock(&rpc_client_lock);
+}
 
 static int
 rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
@@ -1410,3 +1429,41 @@ int rpc_ping(struct rpc_clnt *clnt, int flags)
 	put_rpccred(msg.rpc_cred);
 	return err;
 }
+
+#ifdef RPC_DEBUG
+void rpc_show_tasks(void)
+{
+	struct rpc_clnt *clnt;
+	struct rpc_task *t;
+
+	spin_lock(&rpc_client_lock);
+	if (list_empty(&all_clients))
+		goto out;
+	printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
+		"-rpcwait -action- ---ops--\n");
+	list_for_each_entry(clnt, &all_clients, cl_clients) {
+		if (list_empty(&clnt->cl_tasks))
+			continue;
+		spin_lock(&clnt->cl_lock);
+		list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
+			const char *rpc_waitq = "none";
+
+			if (RPC_IS_QUEUED(t))
+				rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
+
+			printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
+				t->tk_pid,
+				(t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+				t->tk_flags, t->tk_status,
+				t->tk_client,
+				(t->tk_client ? t->tk_client->cl_prog : 0),
+				t->tk_rqstp, t->tk_timeout,
+				rpc_waitq,
+				t->tk_action, t->tk_ops);
+		}
+		spin_unlock(&clnt->cl_lock);
+	}
+out:
+	spin_unlock(&rpc_client_lock);
+}
+#endif
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 05825154ddd9..c0f8d25caf57 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -47,11 +47,6 @@ static void			 rpc_release_task(struct rpc_task *task);
  */
 static RPC_WAITQ(delay_queue, "delayq");
 
-/*
- * All RPC clients are linked into this list
- */
-static LIST_HEAD(all_clients);
-
 /*
  * rpciod-related stuff
  */
@@ -59,11 +54,6 @@ static DEFINE_MUTEX(rpciod_mutex);
 static atomic_t rpciod_users = ATOMIC_INIT(0);
 struct workqueue_struct *rpciod_workqueue;
 
-/*
- * Spinlock for other critical sections of code.
- */
-static DEFINE_SPINLOCK(rpc_sched_lock);
-
 /*
  * Disable the timer for a given RPC task. Should be called with
  * queue->lock and bh_disabled in order to avoid races within
@@ -994,20 +984,6 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
 	spin_unlock(&clnt->cl_lock);
 }
 
-void rpc_register_client(struct rpc_clnt *clnt)
-{
-	spin_lock(&rpc_sched_lock);
-	list_add(&clnt->cl_clients, &all_clients);
-	spin_unlock(&rpc_sched_lock);
-}
-
-void rpc_unregister_client(struct rpc_clnt *clnt)
-{
-	spin_lock(&rpc_sched_lock);
-	list_del(&clnt->cl_clients);
-	spin_unlock(&rpc_sched_lock);
-}
-
 /*
  * Start up the rpciod process if it's not already running.
  */
@@ -1059,44 +1035,6 @@ rpciod_down(void)
 	mutex_unlock(&rpciod_mutex);
 }
 
-#ifdef RPC_DEBUG
-void rpc_show_tasks(void)
-{
-	struct rpc_clnt *clnt;
-	struct rpc_task *t;
-
-	spin_lock(&rpc_sched_lock);
-	if (list_empty(&all_clients))
-		goto out;
-	printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
-		"-rpcwait -action- ---ops--\n");
-	list_for_each_entry(clnt, &all_clients, cl_clients) {
-		if (list_empty(&clnt->cl_tasks))
-			continue;
-		spin_lock(&clnt->cl_lock);
-		list_for_each_entry(t, &clnt->cl_tasks, tk_task) {
-			const char *rpc_waitq = "none";
-
-			if (RPC_IS_QUEUED(t))
-				rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
-
-			printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n",
-				t->tk_pid,
-				(t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
-				t->tk_flags, t->tk_status,
-				t->tk_client,
-				(t->tk_client ? t->tk_client->cl_prog : 0),
-				t->tk_rqstp, t->tk_timeout,
-				rpc_waitq,
-				t->tk_action, t->tk_ops);
-		}
-		spin_unlock(&clnt->cl_lock);
-	}
-out:
-	spin_unlock(&rpc_sched_lock);
-}
-#endif
-
 void
 rpc_destroy_mempool(void)
 {
-- 
cgit v1.3-8-gc7d7


From 4a8c1344dccb848dbcf0edabc8b5c51a8ecf2808 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 7 Jun 2007 10:14:14 -0400
Subject: SUNRPC: Add a backpointer from the struct rpc_cred to the rpc_auth

Cleans up an issue whereby rpcsec_gss uses the rpc_clnt->cl_auth. If we want
to be able to add several rpc_auths to a single rpc_clnt, then this abuse
must go.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h    | 4 ++++
 net/sunrpc/auth_gss/auth_gss.c | 3 ++-
 net/sunrpc/auth_null.c         | 1 +
 net/sunrpc/auth_unix.c         | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 534cdc7be58d..8ef27afeea73 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -30,8 +30,11 @@ struct auth_cred {
 /*
  * Client user credentials
  */
+struct rpc_auth;
+struct rpc_credops;
 struct rpc_cred {
 	struct hlist_node	cr_hash;	/* hash chain */
+	struct rpc_auth *	cr_auth;
 	struct rpc_credops *	cr_ops;
 	unsigned long		cr_expire;	/* when to gc */
 	atomic_t		cr_count;	/* ref count */
@@ -60,6 +63,7 @@ struct rpc_cred_cache {
 	unsigned long		expire;		/* cache expiry interval */
 };
 
+struct rpc_authops;
 struct rpc_auth {
 	unsigned int		au_cslack;	/* call cred size estimate */
 				/* guess at number of u32's auth adds before
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 4e4ccc5b6fea..e894e2fc360d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -386,7 +386,7 @@ static inline int
 gss_refresh_upcall(struct rpc_task *task)
 {
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
-	struct gss_auth *gss_auth = container_of(task->tk_client->cl_auth,
+	struct gss_auth *gss_auth = container_of(cred->cr_auth,
 			struct gss_auth, rpc_auth);
 	struct gss_cred *gss_cred = container_of(cred,
 			struct gss_cred, gc_base);
@@ -741,6 +741,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	 * fail to flag the credential as RPCAUTH_CRED_UPTODATE.
 	 */
 	cred->gc_flags = 0;
+	cred->gc_base.cr_auth = auth;
 	cred->gc_base.cr_ops = &gss_credops;
 	cred->gc_base.cr_flags = RPCAUTH_CRED_NEW;
 	cred->gc_service = gss_auth->service;
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 3df9fccab2f8..890bd9b3794b 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -133,6 +133,7 @@ struct rpc_credops	null_credops = {
 
 static
 struct rpc_cred null_cred = {
+	.cr_auth	= &null_auth,
 	.cr_ops		= &null_credops,
 	.cr_count	= ATOMIC_INIT(1),
 	.cr_flags	= RPCAUTH_CRED_UPTODATE,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 4e7733aee36e..82300b83045e 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -92,6 +92,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 		if (i < NFS_NGROUPS)
 		  cred->uc_gids[i] = NOGROUP;
 	}
+	cred->uc_base.cr_auth = &unix_auth;
 	cred->uc_base.cr_ops = &unix_credops;
 
 	return (struct rpc_cred *) cred;
-- 
cgit v1.3-8-gc7d7


From 6e84c7b66a0aa0be16a7728d1e687c57978dac2c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 7 Jun 2007 15:31:36 -0400
Subject: SUNRPC: Add a downcall queue to struct rpc_inode

Currently, the downcall queue is tied to the struct gss_auth, which means
that different RPCSEC_GSS pseudoflavours must use different upcall pipes.
Add a list to struct rpc_inode that can be used instead.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/rpc_pipe_fs.h |  1 +
 net/sunrpc/auth_gss/auth_gss.c     | 29 ++++++++++-------------------
 net/sunrpc/rpc_pipe.c              |  1 +
 3 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index ad293760f6eb..430cea104817 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -23,6 +23,7 @@ struct rpc_inode {
 	void *private;
 	struct list_head pipe;
 	struct list_head in_upcall;
+	struct list_head in_downcall;
 	int pipelen;
 	int nreaders;
 	int nwriters;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index e407a352440f..50809086fa1b 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -85,7 +85,6 @@ struct gss_auth {
 	struct rpc_auth rpc_auth;
 	struct gss_api_mech *mech;
 	enum rpc_gss_svc service;
-	struct list_head upcalls;
 	struct rpc_clnt *client;
 	struct dentry *dentry;
 };
@@ -268,10 +267,10 @@ gss_release_msg(struct gss_upcall_msg *gss_msg)
 }
 
 static struct gss_upcall_msg *
-__gss_find_upcall(struct gss_auth *gss_auth, uid_t uid)
+__gss_find_upcall(struct rpc_inode *rpci, uid_t uid)
 {
 	struct gss_upcall_msg *pos;
-	list_for_each_entry(pos, &gss_auth->upcalls, list) {
+	list_for_each_entry(pos, &rpci->in_downcall, list) {
 		if (pos->uid != uid)
 			continue;
 		atomic_inc(&pos->count);
@@ -290,13 +289,14 @@ static inline struct gss_upcall_msg *
 gss_add_msg(struct gss_auth *gss_auth, struct gss_upcall_msg *gss_msg)
 {
 	struct inode *inode = gss_auth->dentry->d_inode;
+	struct rpc_inode *rpci = RPC_I(inode);
 	struct gss_upcall_msg *old;
 
 	spin_lock(&inode->i_lock);
-	old = __gss_find_upcall(gss_auth, gss_msg->uid);
+	old = __gss_find_upcall(rpci, gss_msg->uid);
 	if (old == NULL) {
 		atomic_inc(&gss_msg->count);
-		list_add(&gss_msg->list, &gss_auth->upcalls);
+		list_add(&gss_msg->list, &rpci->in_downcall);
 	} else
 		gss_msg = old;
 	spin_unlock(&inode->i_lock);
@@ -493,7 +493,6 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	const void *p, *end;
 	void *buf;
 	struct rpc_clnt *clnt;
-	struct gss_auth *gss_auth;
 	struct gss_upcall_msg *gss_msg;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct gss_cl_ctx *ctx;
@@ -526,9 +525,8 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 
 	err = -ENOENT;
 	/* Find a matching upcall */
-	gss_auth = container_of(clnt->cl_auth, struct gss_auth, rpc_auth);
 	spin_lock(&inode->i_lock);
-	gss_msg = __gss_find_upcall(gss_auth, uid);
+	gss_msg = __gss_find_upcall(RPC_I(inode), uid);
 	if (gss_msg == NULL) {
 		spin_unlock(&inode->i_lock);
 		goto err_put_ctx;
@@ -536,7 +534,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	list_del_init(&gss_msg->list);
 	spin_unlock(&inode->i_lock);
 
-	p = gss_fill_context(p, end, ctx, gss_auth->mech);
+	p = gss_fill_context(p, end, ctx, gss_msg->auth->mech);
 	if (IS_ERR(p)) {
 		err = PTR_ERR(p);
 		gss_msg->msg.errno = (err == -EACCES) ? -EACCES : -EAGAIN;
@@ -563,18 +561,12 @@ static void
 gss_pipe_release(struct inode *inode)
 {
 	struct rpc_inode *rpci = RPC_I(inode);
-	struct rpc_clnt *clnt;
-	struct rpc_auth *auth;
-	struct gss_auth *gss_auth;
+	struct gss_upcall_msg *gss_msg;
 
-	clnt = rpci->private;
-	auth = clnt->cl_auth;
-	gss_auth = container_of(auth, struct gss_auth, rpc_auth);
 	spin_lock(&inode->i_lock);
-	while (!list_empty(&gss_auth->upcalls)) {
-		struct gss_upcall_msg *gss_msg;
+	while (!list_empty(&rpci->in_downcall)) {
 
-		gss_msg = list_entry(gss_auth->upcalls.next,
+		gss_msg = list_entry(rpci->in_downcall.next,
 				struct gss_upcall_msg, list);
 		gss_msg->msg.errno = -EPIPE;
 		atomic_inc(&gss_msg->count);
@@ -637,7 +629,6 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 	gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
 	if (gss_auth->service == 0)
 		goto err_put_mech;
-	INIT_LIST_HEAD(&gss_auth->upcalls);
 	auth = &gss_auth->rpc_auth;
 	auth->au_cslack = GSS_CRED_SLACK >> 2;
 	auth->au_rslack = GSS_VERF_SLACK >> 2;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 826190dacfce..2320f1e42da4 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -833,6 +833,7 @@ init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 	rpci->nreaders = 0;
 	rpci->nwriters = 0;
 	INIT_LIST_HEAD(&rpci->in_upcall);
+	INIT_LIST_HEAD(&rpci->in_downcall);
 	INIT_LIST_HEAD(&rpci->pipe);
 	rpci->pipelen = 0;
 	init_waitqueue_head(&rpci->waitq);
-- 
cgit v1.3-8-gc7d7


From 03a1256f06cf1f58e33971fb4a524479e75c200e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 8 Jun 2007 14:14:53 -0400
Subject: SUNRPC: Add a field to track the number of kernel users of an
 rpc_pipe

This allows us to correctly deduce when we need to remove the pipe.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/rpc_pipe_fs.h |  1 +
 net/sunrpc/rpc_pipe.c              | 12 ++++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index 430cea104817..51b977a4ca20 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -27,6 +27,7 @@ struct rpc_inode {
 	int pipelen;
 	int nreaders;
 	int nwriters;
+	int nkern_readwriters;
 	wait_queue_head_t waitq;
 #define RPC_PIPE_WAIT_FOR_OPEN	1
 	int flags;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index e5fd796e897e..e787b6a43eee 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -737,6 +737,7 @@ rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pi
 			dput (dentry);
 			dentry = ERR_PTR(-EBUSY);
 		}
+		rpci->nkern_readwriters++;
 		goto out;
 	}
 	inode = rpc_get_inode(dir->i_sb, S_IFIFO | S_IRUSR | S_IWUSR);
@@ -749,6 +750,7 @@ rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pi
 	rpci->private = private;
 	rpci->flags = flags;
 	rpci->ops = ops;
+	rpci->nkern_readwriters = 1;
 	inode_dir_notify(dir, DN_CREATE);
 	dget(dentry);
 out:
@@ -773,10 +775,12 @@ rpc_unlink(struct dentry *dentry)
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
-	rpc_close_pipes(dentry->d_inode);
-	error = simple_unlink(dir, dentry);
-	if (!error)
-		d_delete(dentry);
+	if (--RPC_I(dentry->d_inode)->nkern_readwriters == 0) {
+		rpc_close_pipes(dentry->d_inode);
+		error = simple_unlink(dir, dentry);
+		if (!error)
+			d_delete(dentry);
+	}
 	dput(dentry);
 	mutex_unlock(&dir->i_mutex);
 	dput(parent);
-- 
cgit v1.3-8-gc7d7


From 3ab9bb7243489f9db3abf3d05521ddfc6b184c0a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 9 Jun 2007 15:41:42 -0400
Subject: SUNRPC: Fix a memory leak in the auth credcache code

The leak only affects the RPCSEC_GSS caches, since they are the only ones
that are dynamically allocated...
Rename the existing rpcauth_free_credcache() to rpcauth_clear_credcache()
in order to better describe its role, then add a new function
rpcauth_destroy_credcache() that actually frees the cache in addition to
clearing it out.

Also move the call to destroy the credcache in gss_destroy() to come before
the rpc upcall pipe is unlinked.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h    |  3 ++-
 net/sunrpc/auth.c              | 18 ++++++++++++++++--
 net/sunrpc/auth_gss/auth_gss.c |  3 ++-
 net/sunrpc/auth_unix.c         |  2 +-
 net/sunrpc/sunrpc_syms.c       |  2 +-
 5 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 8ef27afeea73..3972b8414c88 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -143,7 +143,8 @@ int			rpcauth_refreshcred(struct rpc_task *);
 void			rpcauth_invalcred(struct rpc_task *);
 int			rpcauth_uptodatecred(struct rpc_task *);
 int			rpcauth_init_credcache(struct rpc_auth *, unsigned long);
-void			rpcauth_free_credcache(struct rpc_auth *);
+void			rpcauth_destroy_credcache(struct rpc_auth *);
+void			rpcauth_clear_credcache(struct rpc_cred_cache *);
 
 static inline
 struct rpc_cred *	get_rpccred(struct rpc_cred *cred)
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 9527f2bb1744..f6b6c81cbc3e 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -137,9 +137,8 @@ void rpcauth_destroy_credlist(struct hlist_head *head)
  * that are not referenced.
  */
 void
-rpcauth_free_credcache(struct rpc_auth *auth)
+rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 {
-	struct rpc_cred_cache *cache = auth->au_credcache;
 	HLIST_HEAD(free);
 	struct hlist_node *pos, *next;
 	struct rpc_cred	*cred;
@@ -157,6 +156,21 @@ rpcauth_free_credcache(struct rpc_auth *auth)
 	rpcauth_destroy_credlist(&free);
 }
 
+/*
+ * Destroy the RPC credential cache
+ */
+void
+rpcauth_destroy_credcache(struct rpc_auth *auth)
+{
+	struct rpc_cred_cache *cache = auth->au_credcache;
+
+	if (cache) {
+		auth->au_credcache = NULL;
+		rpcauth_clear_credcache(cache);
+		kfree(cache);
+	}
+}
+
 static void
 rpcauth_prune_expired(struct rpc_auth *auth, struct rpc_cred *cred, struct hlist_head *free)
 {
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 50809086fa1b..8b4c02f8befb 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -665,12 +665,13 @@ gss_destroy(struct rpc_auth *auth)
 	dprintk("RPC:       destroying GSS authenticator %p flavor %d\n",
 			auth, auth->au_flavor);
 
+	rpcauth_destroy_credcache(auth);
+
 	gss_auth = container_of(auth, struct gss_auth, rpc_auth);
 	rpc_unlink(gss_auth->dentry);
 	gss_auth->dentry = NULL;
 	gss_mech_put(gss_auth->mech);
 
-	rpcauth_free_credcache(auth);
 	kfree(gss_auth);
 	module_put(THIS_MODULE);
 }
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 82300b83045e..5622783011a4 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -50,7 +50,7 @@ static void
 unx_destroy(struct rpc_auth *auth)
 {
 	dprintk("RPC:       destroying UNIX authenticator %p\n", auth);
-	rpcauth_free_credcache(auth);
+	rpcauth_clear_credcache(auth->au_credcache);
 }
 
 /*
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index b99b11b11461..3e19e7af6799 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -57,7 +57,7 @@ EXPORT_SYMBOL(rpcauth_unregister);
 EXPORT_SYMBOL(rpcauth_create);
 EXPORT_SYMBOL(rpcauth_lookupcred);
 EXPORT_SYMBOL(rpcauth_lookup_credcache);
-EXPORT_SYMBOL(rpcauth_free_credcache);
+EXPORT_SYMBOL(rpcauth_destroy_credcache);
 EXPORT_SYMBOL(rpcauth_init_credcache);
 EXPORT_SYMBOL(put_rpccred);
 
-- 
cgit v1.3-8-gc7d7


From 64c91a1f1c8bc4295fd6b90df8adf911a7dd64f4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Jun 2007 10:17:16 -0400
Subject: SUNRPC: Make rpc_ping() static

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h | 1 -
 net/sunrpc/clnt.c           | 4 +++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index a451351c7eff..a0e51e193284 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -136,7 +136,6 @@ void		rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset);
 void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 size_t		rpc_max_payload(struct rpc_clnt *);
 void		rpc_force_rebind(struct rpc_clnt *);
-int		rpc_ping(struct rpc_clnt *clnt, int flags);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
 char *		rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 76eef19cf995..4e91f3110938 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -72,6 +72,8 @@ static void	call_connect_status(struct rpc_task *task);
 static __be32 *	call_header(struct rpc_task *task);
 static __be32 *	call_verify(struct rpc_task *task);
 
+static int	rpc_ping(struct rpc_clnt *clnt, int flags);
+
 static void rpc_register_client(struct rpc_clnt *clnt)
 {
 	spin_lock(&rpc_client_lock);
@@ -1441,7 +1443,7 @@ static struct rpc_procinfo rpcproc_null = {
 	.p_decode = rpcproc_decode_null,
 };
 
-int rpc_ping(struct rpc_clnt *clnt, int flags)
+static int rpc_ping(struct rpc_clnt *clnt, int flags)
 {
 	struct rpc_message msg = {
 		.rpc_proc = &rpcproc_null,
-- 
cgit v1.3-8-gc7d7


From 5e1550d6a2c2dd33ff0ca5febefd8e9c65c6ca1e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Jun 2007 10:17:16 -0400
Subject: SUNRPC: Add the helper function 'rpc_call_null()'

Does a NULL RPC call and returns a pointer to the resulting rpc_task. The
call may be either synchronous or asynchronous.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index a0e51e193284..097984b03857 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -130,6 +130,8 @@ int		rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg,
 			       void *calldata);
 int		rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg,
 			      int flags);
+struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred,
+			       int flags);
 void		rpc_restart_call(struct rpc_task *);
 void		rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset);
 void		rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 4e91f3110938..5a28ffac99ea 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1455,6 +1455,16 @@ static int rpc_ping(struct rpc_clnt *clnt, int flags)
 	return err;
 }
 
+struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &rpcproc_null,
+		.rpc_cred = cred,
+	};
+	return rpc_do_run_task(clnt, &msg, flags, &rpc_default_ops, NULL);
+}
+EXPORT_SYMBOL(rpc_call_null);
+
 #ifdef RPC_DEBUG
 void rpc_show_tasks(void)
 {
-- 
cgit v1.3-8-gc7d7


From de7a8ce38aea529876db3890b61947bc4bc004da Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Jun 2007 10:46:47 -0400
Subject: SUNRPC: Rename rpcauth_destroy() to rpcauth_release()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h | 2 +-
 net/sunrpc/auth.c           | 4 ++--
 net/sunrpc/clnt.c           | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 3972b8414c88..bc77c730325c 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -128,7 +128,7 @@ extern struct rpc_authops	authdes_ops;
 int			rpcauth_register(struct rpc_authops *);
 int			rpcauth_unregister(struct rpc_authops *);
 struct rpc_auth *	rpcauth_create(rpc_authflavor_t, struct rpc_clnt *);
-void			rpcauth_destroy(struct rpc_auth *);
+void			rpcauth_release(struct rpc_auth *);
 struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
 struct rpc_cred *	rpcauth_bindcred(struct rpc_task *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 584f24311a80..1686dc74c6a9 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -93,7 +93,7 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
 	if (IS_ERR(auth))
 		return auth;
 	if (clnt->cl_auth)
-		rpcauth_destroy(clnt->cl_auth);
+		rpcauth_release(clnt->cl_auth);
 	clnt->cl_auth = auth;
 
 out:
@@ -101,7 +101,7 @@ out:
 }
 
 void
-rpcauth_destroy(struct rpc_auth *auth)
+rpcauth_release(struct rpc_auth *auth)
 {
 	if (!atomic_dec_and_test(&auth->au_count))
 		return;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 5a28ffac99ea..98df44e453fe 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -353,7 +353,7 @@ rpc_free_client(struct kref *kref)
 	dprintk("RPC:       destroying %s client for %s\n",
 			clnt->cl_protname, clnt->cl_server);
 	if (clnt->cl_auth) {
-		rpcauth_destroy(clnt->cl_auth);
+		rpcauth_release(clnt->cl_auth);
 		clnt->cl_auth = NULL;
 	}
 	if (!IS_ERR(clnt->cl_dentry)) {
-- 
cgit v1.3-8-gc7d7


From f1c0a8615090359d57e096157feb9f900cbb233c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Jun 2007 20:17:58 -0400
Subject: SUNRPC: Mark auth and cred operation tables as constant.

Also do the same for gss_api operation tables.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h          | 15 ++++++---------
 include/linux/sunrpc/gss_api.h       |  2 +-
 net/sunrpc/auth.c                    |  8 ++++----
 net/sunrpc/auth_gss/auth_gss.c       |  8 ++++----
 net/sunrpc/auth_gss/gss_krb5_mech.c  |  2 +-
 net/sunrpc/auth_gss/gss_spkm3_mech.c |  2 +-
 net/sunrpc/auth_null.c               |  4 ++--
 net/sunrpc/auth_unix.c               |  6 +++---
 8 files changed, 22 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index bc77c730325c..e606c2804685 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -35,7 +35,7 @@ struct rpc_credops;
 struct rpc_cred {
 	struct hlist_node	cr_hash;	/* hash chain */
 	struct rpc_auth *	cr_auth;
-	struct rpc_credops *	cr_ops;
+	const struct rpc_credops *cr_ops;
 	unsigned long		cr_expire;	/* when to gc */
 	atomic_t		cr_count;	/* ref count */
 	unsigned short		cr_flags;	/* various flags */
@@ -73,7 +73,7 @@ struct rpc_auth {
 	unsigned int		au_verfsize;
 
 	unsigned int		au_flags;	/* various flags */
-	struct rpc_authops *	au_ops;		/* operations */
+	const struct rpc_authops *au_ops;		/* operations */
 	rpc_authflavor_t	au_flavor;	/* pseudoflavor (note may
 						 * differ from the flavor in
 						 * au_ops->au_flavor in gss
@@ -119,14 +119,11 @@ struct rpc_credops {
 						void *, __be32 *, void *);
 };
 
-extern struct rpc_authops	authunix_ops;
-extern struct rpc_authops	authnull_ops;
-#ifdef CONFIG_SUNRPC_SECURE
-extern struct rpc_authops	authdes_ops;
-#endif
+extern const struct rpc_authops	authunix_ops;
+extern const struct rpc_authops	authnull_ops;
 
-int			rpcauth_register(struct rpc_authops *);
-int			rpcauth_unregister(struct rpc_authops *);
+int			rpcauth_register(const struct rpc_authops *);
+int			rpcauth_unregister(const struct rpc_authops *);
 struct rpc_auth *	rpcauth_create(rpc_authflavor_t, struct rpc_clnt *);
 void			rpcauth_release(struct rpc_auth *);
 struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h
index 5eca9e442051..bbac101ac372 100644
--- a/include/linux/sunrpc/gss_api.h
+++ b/include/linux/sunrpc/gss_api.h
@@ -77,7 +77,7 @@ struct gss_api_mech {
 	struct module		*gm_owner;
 	struct xdr_netobj	gm_oid;
 	char			*gm_name;
-	struct gss_api_ops	*gm_ops;
+	const struct gss_api_ops *gm_ops;
 	/* pseudoflavors supported by this mechanism: */
 	int			gm_pf_num;
 	struct pf_desc *	gm_pfs;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 1686dc74c6a9..d3f0f944c0b5 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -19,7 +19,7 @@
 #endif
 
 static DEFINE_SPINLOCK(rpc_authflavor_lock);
-static struct rpc_authops *	auth_flavors[RPC_AUTH_MAXFLAVOR] = {
+static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
 	&authnull_ops,		/* AUTH_NULL */
 	&authunix_ops,		/* AUTH_UNIX */
 	NULL,			/* others can be loadable modules */
@@ -33,7 +33,7 @@ pseudoflavor_to_flavor(u32 flavor) {
 }
 
 int
-rpcauth_register(struct rpc_authops *ops)
+rpcauth_register(const struct rpc_authops *ops)
 {
 	rpc_authflavor_t flavor;
 	int ret = -EPERM;
@@ -50,7 +50,7 @@ rpcauth_register(struct rpc_authops *ops)
 }
 
 int
-rpcauth_unregister(struct rpc_authops *ops)
+rpcauth_unregister(const struct rpc_authops *ops)
 {
 	rpc_authflavor_t flavor;
 	int ret = -EPERM;
@@ -70,7 +70,7 @@ struct rpc_auth *
 rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
 {
 	struct rpc_auth		*auth;
-	struct rpc_authops	*ops;
+	const struct rpc_authops *ops;
 	u32			flavor = pseudoflavor_to_flavor(pseudoflavor);
 
 	auth = ERR_PTR(-EINVAL);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 459dc9b1d1ad..177a9e413c0a 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -54,9 +54,9 @@
 #include <linux/sunrpc/gss_api.h>
 #include <asm/uaccess.h>
 
-static struct rpc_authops authgss_ops;
+static const struct rpc_authops authgss_ops;
 
-static struct rpc_credops gss_credops;
+static const struct rpc_credops gss_credops;
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_AUTH
@@ -1193,7 +1193,7 @@ out:
 	return status;
 }
 
-static struct rpc_authops authgss_ops = {
+static const struct rpc_authops authgss_ops = {
 	.owner		= THIS_MODULE,
 	.au_flavor	= RPC_AUTH_GSS,
 #ifdef RPC_DEBUG
@@ -1205,7 +1205,7 @@ static struct rpc_authops authgss_ops = {
 	.crcreate	= gss_create_cred
 };
 
-static struct rpc_credops gss_credops = {
+static const struct rpc_credops gss_credops = {
 	.cr_name	= "AUTH_GSS",
 	.crdestroy	= gss_destroy_cred,
 	.cr_init	= gss_cred_init,
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 7b1943217053..71b9daefdff3 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -201,7 +201,7 @@ gss_delete_sec_context_kerberos(void *internal_ctx) {
 	kfree(kctx);
 }
 
-static struct gss_api_ops gss_kerberos_ops = {
+static const struct gss_api_ops gss_kerberos_ops = {
 	.gss_import_sec_context	= gss_import_sec_context_kerberos,
 	.gss_get_mic		= gss_get_mic_kerberos,
 	.gss_verify_mic		= gss_verify_mic_kerberos,
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index 7e15aa68ae64..577d590e755f 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -202,7 +202,7 @@ gss_get_mic_spkm3(struct gss_ctx	*ctx,
 	return err;
 }
 
-static struct gss_api_ops gss_spkm3_ops = {
+static const struct gss_api_ops gss_spkm3_ops = {
 	.gss_import_sec_context	= gss_import_sec_context_spkm3,
 	.gss_get_mic		= gss_get_mic_spkm3,
 	.gss_verify_mic		= gss_verify_mic_spkm3,
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 890bd9b3794b..fe9b6aaf91eb 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -101,7 +101,7 @@ nul_validate(struct rpc_task *task, __be32 *p)
 	return p;
 }
 
-struct rpc_authops authnull_ops = {
+const struct rpc_authops authnull_ops = {
 	.owner		= THIS_MODULE,
 	.au_flavor	= RPC_AUTH_NULL,
 #ifdef RPC_DEBUG
@@ -122,7 +122,7 @@ struct rpc_auth null_auth = {
 };
 
 static
-struct rpc_credops	null_credops = {
+const struct rpc_credops null_credops = {
 	.cr_name	= "AUTH_NULL",
 	.crdestroy	= nul_destroy_cred,
 	.crmatch	= nul_match,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index e54782e75d59..6600c7ad72a9 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -34,7 +34,7 @@ struct unx_cred {
 
 static struct rpc_auth		unix_auth;
 static struct rpc_cred_cache	unix_cred_cache;
-static struct rpc_credops	unix_credops;
+static const struct rpc_credops	unix_credops;
 
 static struct rpc_auth *
 unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
@@ -205,7 +205,7 @@ unx_validate(struct rpc_task *task, __be32 *p)
 	return p;
 }
 
-struct rpc_authops	authunix_ops = {
+const struct rpc_authops authunix_ops = {
 	.owner		= THIS_MODULE,
 	.au_flavor	= RPC_AUTH_UNIX,
 #ifdef RPC_DEBUG
@@ -233,7 +233,7 @@ struct rpc_auth		unix_auth = {
 };
 
 static
-struct rpc_credops	unix_credops = {
+const struct rpc_credops unix_credops = {
 	.cr_name	= "AUTH_UNIX",
 	.crdestroy	= unx_destroy_cred,
 	.crmatch	= unx_match,
-- 
cgit v1.3-8-gc7d7


From 5fe4755e2526a2aa82b7ed8daeb3aed74a236925 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Jun 2007 19:55:31 -0400
Subject: SUNRPC: Clean up rpc credential initialisation

Add a helper rpc_cred_init()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h     |  1 +
 include/linux/sunrpc/auth_gss.h |  5 -----
 net/sunrpc/auth.c               | 24 ++++++++++++++++++------
 net/sunrpc/auth_gss/auth_gss.c  |  6 +-----
 net/sunrpc/auth_unix.c          | 10 ++--------
 5 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index e606c2804685..d5bfc67461fc 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -127,6 +127,7 @@ int			rpcauth_unregister(const struct rpc_authops *);
 struct rpc_auth *	rpcauth_create(rpc_authflavor_t, struct rpc_clnt *);
 void			rpcauth_release(struct rpc_auth *);
 struct rpc_cred *	rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int);
+void			rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *);
 struct rpc_cred *	rpcauth_lookupcred(struct rpc_auth *, int);
 struct rpc_cred *	rpcauth_bindcred(struct rpc_task *);
 void			rpcauth_holdcred(struct rpc_task *);
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 2db2fbf34947..0bd1d06777b9 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -85,11 +85,6 @@ struct gss_cred {
 	struct gss_upcall_msg	*gc_upcall;
 };
 
-#define gc_uid			gc_base.cr_uid
-#define gc_count		gc_base.cr_count
-#define gc_flags		gc_base.cr_flags
-#define gc_expire		gc_base.cr_expire
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_AUTH_GSS_H */
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d3f0f944c0b5..2156327da45b 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -264,13 +264,9 @@ retry:
 
 	if (!cred) {
 		new = auth->au_ops->crcreate(auth, acred, flags);
-		if (!IS_ERR(new)) {
-#ifdef RPC_DEBUG
-			new->cr_magic = RPCAUTH_CRED_MAGIC;
-#endif
+		if (!IS_ERR(new))
 			goto retry;
-		} else
-			cred = new;
+		cred = new;
 	} else if ((cred->cr_flags & RPCAUTH_CRED_NEW)
 			&& cred->cr_ops->cr_init != NULL
 			&& !(flags & RPCAUTH_LOOKUP_NEW)) {
@@ -302,6 +298,22 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
 	return ret;
 }
 
+void
+rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
+		  struct rpc_auth *auth, const struct rpc_credops *ops)
+{
+	INIT_HLIST_NODE(&cred->cr_hash);
+	atomic_set(&cred->cr_count, 1);
+	cred->cr_auth = auth;
+	cred->cr_ops = ops;
+	cred->cr_expire = jiffies;
+#ifdef RPC_DEBUG
+	cred->cr_magic = RPCAUTH_CRED_MAGIC;
+#endif
+	cred->cr_uid = acred->uid;
+}
+EXPORT_SYMBOL(rpcauth_init_cred);
+
 struct rpc_cred *
 rpcauth_bindcred(struct rpc_task *task)
 {
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 177a9e413c0a..766de0a41b22 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -727,15 +727,11 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	if (!(cred = kzalloc(sizeof(*cred), GFP_KERNEL)))
 		goto out_err;
 
-	atomic_set(&cred->gc_count, 1);
-	cred->gc_uid = acred->uid;
+	rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
 	/*
 	 * Note: in order to force a call to call_refresh(), we deliberately
 	 * fail to flag the credential as RPCAUTH_CRED_UPTODATE.
 	 */
-	cred->gc_flags = 0;
-	cred->gc_base.cr_auth = auth;
-	cred->gc_base.cr_ops = &gss_credops;
 	cred->gc_base.cr_flags = RPCAUTH_CRED_NEW;
 	cred->gc_service = gss_auth->service;
 	return &cred->gc_base;
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 6600c7ad72a9..2f1bdb5c86b3 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -20,9 +20,6 @@ struct unx_cred {
 	gid_t			uc_gids[NFS_NGROUPS];
 };
 #define uc_uid			uc_base.cr_uid
-#define uc_count		uc_base.cr_count
-#define uc_flags		uc_base.cr_flags
-#define uc_expire		uc_base.cr_expire
 
 #define UNX_CRED_EXPIRE		(60 * HZ)
 
@@ -74,8 +71,8 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	if (!(cred = kmalloc(sizeof(*cred), GFP_KERNEL)))
 		return ERR_PTR(-ENOMEM);
 
-	atomic_set(&cred->uc_count, 1);
-	cred->uc_flags = RPCAUTH_CRED_UPTODATE;
+	rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
+	cred->uc_base.cr_flags = RPCAUTH_CRED_UPTODATE;
 	if (flags & RPCAUTH_LOOKUP_ROOTCREDS) {
 		cred->uc_uid = 0;
 		cred->uc_gid = 0;
@@ -85,15 +82,12 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 		if (groups > NFS_NGROUPS)
 			groups = NFS_NGROUPS;
 
-		cred->uc_uid = acred->uid;
 		cred->uc_gid = acred->gid;
 		for (i = 0; i < groups; i++)
 			cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
 		if (i < NFS_NGROUPS)
 		  cred->uc_gids[i] = NOGROUP;
 	}
-	cred->uc_base.cr_auth = &unix_auth;
-	cred->uc_base.cr_ops = &unix_credops;
 
 	return (struct rpc_cred *) cred;
 }
-- 
cgit v1.3-8-gc7d7


From fc432dd90760a629c57026e57f65ff80a1a31d2f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 25 Jun 2007 10:15:15 -0400
Subject: SUNRPC: Enforce atomic updates of rpc_cred->cr_flags

Convert to the use of atomic bitops...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h    | 10 +++++-----
 net/sunrpc/auth.c              | 22 ++++++++++++----------
 net/sunrpc/auth_gss/auth_gss.c | 22 +++++++++++-----------
 net/sunrpc/auth_null.c         |  4 ++--
 net/sunrpc/auth_unix.c         |  4 ++--
 5 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index d5bfc67461fc..8586503d5ebd 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -36,19 +36,19 @@ struct rpc_cred {
 	struct hlist_node	cr_hash;	/* hash chain */
 	struct rpc_auth *	cr_auth;
 	const struct rpc_credops *cr_ops;
-	unsigned long		cr_expire;	/* when to gc */
-	atomic_t		cr_count;	/* ref count */
-	unsigned short		cr_flags;	/* various flags */
 #ifdef RPC_DEBUG
 	unsigned long		cr_magic;	/* 0x0f4aa4f0 */
 #endif
+	unsigned long		cr_expire;	/* when to gc */
+	unsigned long		cr_flags;	/* various flags */
+	atomic_t		cr_count;	/* ref count */
 
 	uid_t			cr_uid;
 
 	/* per-flavor data */
 };
-#define RPCAUTH_CRED_NEW	0x0001
-#define RPCAUTH_CRED_UPTODATE	0x0002
+#define RPCAUTH_CRED_NEW	0
+#define RPCAUTH_CRED_UPTODATE	1
 
 #define RPCAUTH_CRED_MAGIC	0x0f4aa4f0
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 2156327da45b..4d7c78b05d1e 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -190,8 +190,8 @@ rpcauth_prune_expired(struct rpc_auth *auth, struct rpc_cred *cred, struct hlist
 	if (atomic_read(&cred->cr_count) != 1)
 	       return;
 	if (time_after(jiffies, cred->cr_expire + auth->au_credcache->expire))
-		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
-	if (!(cred->cr_flags & RPCAUTH_CRED_UPTODATE)) {
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) {
 		__hlist_del(&cred->cr_hash);
 		hlist_add_head(&cred->cr_hash, free);
 	}
@@ -267,7 +267,7 @@ retry:
 		if (!IS_ERR(new))
 			goto retry;
 		cred = new;
-	} else if ((cred->cr_flags & RPCAUTH_CRED_NEW)
+	} else if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)
 			&& cred->cr_ops->cr_init != NULL
 			&& !(flags & RPCAUTH_LOOKUP_NEW)) {
 		int res = cred->cr_ops->cr_init(auth, cred);
@@ -440,17 +440,19 @@ rpcauth_refreshcred(struct rpc_task *task)
 void
 rpcauth_invalcred(struct rpc_task *task)
 {
+	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+
 	dprintk("RPC: %5u invalidating %s cred %p\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name, task->tk_msg.rpc_cred);
-	spin_lock(&rpc_credcache_lock);
-	if (task->tk_msg.rpc_cred)
-		task->tk_msg.rpc_cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
-	spin_unlock(&rpc_credcache_lock);
+		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+	if (cred)
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 }
 
 int
 rpcauth_uptodatecred(struct rpc_task *task)
 {
-	return !(task->tk_msg.rpc_cred) ||
-		(task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_UPTODATE);
+	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+
+	return cred == NULL ||
+		test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0;
 }
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 766de0a41b22..55c47ae0a258 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -114,8 +114,8 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
 	write_lock(&gss_ctx_lock);
 	old = gss_cred->gc_ctx;
 	gss_cred->gc_ctx = ctx;
-	cred->cr_flags |= RPCAUTH_CRED_UPTODATE;
-	cred->cr_flags &= ~RPCAUTH_CRED_NEW;
+	set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
 	write_unlock(&gss_ctx_lock);
 	if (old)
 		gss_put_ctx(old);
@@ -128,7 +128,7 @@ gss_cred_is_uptodate_ctx(struct rpc_cred *cred)
 	int res = 0;
 
 	read_lock(&gss_ctx_lock);
-	if ((cred->cr_flags & RPCAUTH_CRED_UPTODATE) && gss_cred->gc_ctx)
+	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) && gss_cred->gc_ctx)
 		res = 1;
 	read_unlock(&gss_ctx_lock);
 	return res;
@@ -732,7 +732,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	 * Note: in order to force a call to call_refresh(), we deliberately
 	 * fail to flag the credential as RPCAUTH_CRED_UPTODATE.
 	 */
-	cred->gc_base.cr_flags = RPCAUTH_CRED_NEW;
+	cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW;
 	cred->gc_service = gss_auth->service;
 	return &cred->gc_base;
 
@@ -764,7 +764,7 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 	 * we don't really care if the credential has expired or not,
 	 * since the caller should be prepared to reinitialise it.
 	 */
-	if ((flags & RPCAUTH_LOOKUP_NEW) && (rc->cr_flags & RPCAUTH_CRED_NEW))
+	if ((flags & RPCAUTH_LOOKUP_NEW) && test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
 		goto out;
 	/* Don't match with creds that have expired. */
 	if (gss_cred->gc_ctx && time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
@@ -820,7 +820,7 @@ gss_marshal(struct rpc_task *task, __be32 *p)
 	mic.data = (u8 *)(p + 1);
 	maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED) {
-		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	} else if (maj_stat != 0) {
 		printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
 		goto out_put_ctx;
@@ -873,7 +873,7 @@ gss_validate(struct rpc_task *task, __be32 *p)
 
 	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
-		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	if (maj_stat)
 		goto out_bad;
 	/* We leave it to unwrap to calculate au_rslack. For now we just
@@ -927,7 +927,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
 	status = -EIO; /* XXX? */
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
-		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	else if (maj_stat)
 		return status;
 	q = xdr_encode_opaque(p, NULL, mic.len);
@@ -1026,7 +1026,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	/* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was
 	 * done anyway, so it's safe to put the request on the wire: */
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
-		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	else if (maj_stat)
 		return status;
 
@@ -1113,7 +1113,7 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 
 	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
-		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	if (maj_stat != GSS_S_COMPLETE)
 		return status;
 	return 0;
@@ -1138,7 +1138,7 @@ gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 
 	maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
-		cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE;
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	if (maj_stat != GSS_S_COMPLETE)
 		return status;
 	if (ntohl(*(*p)++) != rqstp->rq_seqno)
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index fe9b6aaf91eb..6c905fb11c5d 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -76,7 +76,7 @@ nul_marshal(struct rpc_task *task, __be32 *p)
 static int
 nul_refresh(struct rpc_task *task)
 {
-	task->tk_msg.rpc_cred->cr_flags |= RPCAUTH_CRED_UPTODATE;
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags);
 	return 0;
 }
 
@@ -136,7 +136,7 @@ struct rpc_cred null_cred = {
 	.cr_auth	= &null_auth,
 	.cr_ops		= &null_credops,
 	.cr_count	= ATOMIC_INIT(1),
-	.cr_flags	= RPCAUTH_CRED_UPTODATE,
+	.cr_flags	= 1UL << RPCAUTH_CRED_UPTODATE,
 #ifdef RPC_DEBUG
 	.cr_magic	= RPCAUTH_CRED_MAGIC,
 #endif
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index f17dabbab1c7..29d50ffa69d6 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -72,7 +72,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 		return ERR_PTR(-ENOMEM);
 
 	rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
-	cred->uc_base.cr_flags = RPCAUTH_CRED_UPTODATE;
+	cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
 	if (flags & RPCAUTH_LOOKUP_ROOTCREDS) {
 		cred->uc_uid = 0;
 		cred->uc_gid = 0;
@@ -172,7 +172,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
 static int
 unx_refresh(struct rpc_task *task)
 {
-	task->tk_msg.rpc_cred->cr_flags |= RPCAUTH_CRED_UPTODATE;
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags);
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From e092bdcd939416ef911090890096fe07d0281a5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 23 Jun 2007 19:45:36 -0400
Subject: SUNRPC: cleanup rpc credential cache garbage collection

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |   1 +
 net/sunrpc/auth.c           | 121 ++++++++++++++++++++++++++------------------
 net/sunrpc/auth_null.c      |   1 +
 3 files changed, 74 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 8586503d5ebd..4e78f0c5f014 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -34,6 +34,7 @@ struct rpc_auth;
 struct rpc_credops;
 struct rpc_cred {
 	struct hlist_node	cr_hash;	/* hash chain */
+	struct list_head	cr_lru;		/* lru garbage collection */
 	struct rpc_auth *	cr_auth;
 	const struct rpc_credops *cr_ops;
 #ifdef RPC_DEBUG
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 4d7c78b05d1e..00f9649b0901 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -25,6 +25,8 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
 	NULL,			/* others can be loadable modules */
 };
 
+static LIST_HEAD(cred_unused);
+
 static u32
 pseudoflavor_to_flavor(u32 flavor) {
 	if (flavor >= RPC_AUTH_MAXFLAVOR)
@@ -134,13 +136,13 @@ rpcauth_init_credcache(struct rpc_auth *auth, unsigned long expire)
  * Destroy a list of credentials
  */
 static inline
-void rpcauth_destroy_credlist(struct hlist_head *head)
+void rpcauth_destroy_credlist(struct list_head *head)
 {
 	struct rpc_cred *cred;
 
-	while (!hlist_empty(head)) {
-		cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
-		hlist_del_init(&cred->cr_hash);
+	while (!list_empty(head)) {
+		cred = list_entry(head->next, struct rpc_cred, cr_lru);
+		list_del_init(&cred->cr_lru);
 		put_rpccred(cred);
 	}
 }
@@ -152,17 +154,20 @@ void rpcauth_destroy_credlist(struct hlist_head *head)
 void
 rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 {
-	HLIST_HEAD(free);
-	struct hlist_node *pos, *next;
+	LIST_HEAD(free);
+	struct hlist_head *head;
 	struct rpc_cred	*cred;
 	int		i;
 
 	spin_lock(&rpc_credcache_lock);
 	for (i = 0; i < RPC_CREDCACHE_NR; i++) {
-		hlist_for_each_safe(pos, next, &cache->hashtable[i]) {
-			cred = hlist_entry(pos, struct rpc_cred, cr_hash);
-			__hlist_del(&cred->cr_hash);
-			hlist_add_head(&cred->cr_hash, &free);
+		head = &cache->hashtable[i];
+		while (!hlist_empty(head)) {
+			cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
+			get_rpccred(cred);
+			list_move_tail(&cred->cr_lru, &free);
+			smp_wmb();
+			hlist_del_init(&cred->cr_hash);
 		}
 	}
 	spin_unlock(&rpc_credcache_lock);
@@ -184,38 +189,39 @@ rpcauth_destroy_credcache(struct rpc_auth *auth)
 	}
 }
 
+/*
+ * Remove stale credentials. Avoid sleeping inside the loop.
+ */
 static void
-rpcauth_prune_expired(struct rpc_auth *auth, struct rpc_cred *cred, struct hlist_head *free)
+rpcauth_prune_expired(struct list_head *free)
 {
-	if (atomic_read(&cred->cr_count) != 1)
-	       return;
-	if (time_after(jiffies, cred->cr_expire + auth->au_credcache->expire))
-		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
-	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0) {
-		__hlist_del(&cred->cr_hash);
-		hlist_add_head(&cred->cr_hash, free);
+	struct rpc_cred *cred;
+
+	while (!list_empty(&cred_unused)) {
+		cred = list_entry(cred_unused.next, struct rpc_cred, cr_lru);
+		if (time_after(jiffies, cred->cr_expire +
+					cred->cr_auth->au_credcache->expire))
+			break;
+		list_del_init(&cred->cr_lru);
+		if (atomic_read(&cred->cr_count) != 0)
+			continue;
+		get_rpccred(cred);
+		list_add_tail(&cred->cr_lru, free);
+		smp_wmb();
+		hlist_del_init(&cred->cr_hash);
 	}
 }
 
 /*
- * Remove stale credentials. Avoid sleeping inside the loop.
+ * Run garbage collector.
  */
 static void
-rpcauth_gc_credcache(struct rpc_auth *auth, struct hlist_head *free)
+rpcauth_gc_credcache(struct rpc_cred_cache *cache, struct list_head *free)
 {
-	struct rpc_cred_cache *cache = auth->au_credcache;
-	struct hlist_node *pos, *next;
-	struct rpc_cred	*cred;
-	int		i;
-
-	dprintk("RPC:       gc'ing RPC credentials for auth %p\n", auth);
-	for (i = 0; i < RPC_CREDCACHE_NR; i++) {
-		hlist_for_each_safe(pos, next, &cache->hashtable[i]) {
-			cred = hlist_entry(pos, struct rpc_cred, cr_hash);
-			rpcauth_prune_expired(auth, cred, free);
-		}
-	}
+	if (time_before(jiffies, cache->nextgc))
+		return;
 	cache->nextgc = jiffies + cache->expire;
+	rpcauth_prune_expired(free);
 }
 
 /*
@@ -225,39 +231,35 @@ struct rpc_cred *
 rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 		int flags)
 {
+	LIST_HEAD(free);
 	struct rpc_cred_cache *cache = auth->au_credcache;
-	HLIST_HEAD(free);
-	struct hlist_node *pos, *next;
+	struct hlist_node *pos;
 	struct rpc_cred	*new = NULL,
-			*cred = NULL;
+			*cred = NULL,
+			*entry;
 	int		nr = 0;
 
 	if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS))
 		nr = acred->uid & RPC_CREDCACHE_MASK;
 retry:
 	spin_lock(&rpc_credcache_lock);
-	if (time_before(cache->nextgc, jiffies))
-		rpcauth_gc_credcache(auth, &free);
-	hlist_for_each_safe(pos, next, &cache->hashtable[nr]) {
-		struct rpc_cred *entry;
-		entry = hlist_entry(pos, struct rpc_cred, cr_hash);
-		if (entry->cr_ops->crmatch(acred, entry, flags)) {
-			hlist_del(&entry->cr_hash);
-			cred = entry;
-			break;
-		}
-		rpcauth_prune_expired(auth, entry, &free);
+	hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) {
+		if (!entry->cr_ops->crmatch(acred, entry, flags))
+			continue;
+		cred = get_rpccred(entry);
+		hlist_del(&entry->cr_hash);
+		break;
 	}
 	if (new) {
 		if (cred)
-			hlist_add_head(&new->cr_hash, &free);
+			list_add_tail(&new->cr_lru, &free);
 		else
 			cred = new;
 	}
 	if (cred) {
 		hlist_add_head(&cred->cr_hash, &cache->hashtable[nr]);
-		get_rpccred(cred);
 	}
+	rpcauth_gc_credcache(cache, &free);
 	spin_unlock(&rpc_credcache_lock);
 
 	rpcauth_destroy_credlist(&free);
@@ -303,6 +305,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 		  struct rpc_auth *auth, const struct rpc_credops *ops)
 {
 	INIT_HLIST_NODE(&cred->cr_hash);
+	INIT_LIST_HEAD(&cred->cr_lru);
 	atomic_set(&cred->cr_count, 1);
 	cred->cr_auth = auth;
 	cred->cr_ops = ops;
@@ -353,9 +356,29 @@ rpcauth_holdcred(struct rpc_task *task)
 void
 put_rpccred(struct rpc_cred *cred)
 {
-	cred->cr_expire = jiffies;
+	/* Fast path for unhashed credentials */
+	if (!hlist_unhashed(&cred->cr_hash))
+		goto need_lock;
+
 	if (!atomic_dec_and_test(&cred->cr_count))
 		return;
+	goto out_destroy;
+
+need_lock:
+	if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
+		return;
+	if (!list_empty(&cred->cr_lru))
+		list_del_init(&cred->cr_lru);
+	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+		hlist_del(&cred->cr_hash);
+	else if (!hlist_unhashed(&cred->cr_hash)) {
+		cred->cr_expire = jiffies;
+		list_add_tail(&cred->cr_lru, &cred_unused);
+		spin_unlock(&rpc_credcache_lock);
+		return;
+	}
+	spin_unlock(&rpc_credcache_lock);
+out_destroy:
 	cred->cr_ops->crdestroy(cred);
 }
 
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 6c905fb11c5d..537d0e8589dd 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -133,6 +133,7 @@ const struct rpc_credops null_credops = {
 
 static
 struct rpc_cred null_cred = {
+	.cr_lru		= LIST_HEAD_INIT(null_cred.cr_lru),
 	.cr_auth	= &null_auth,
 	.cr_ops		= &null_credops,
 	.cr_count	= ATOMIC_INIT(1),
-- 
cgit v1.3-8-gc7d7


From 31be5bf15f3dafffce110eb1afadccbf2e3067b4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 24 Jun 2007 15:55:26 -0400
Subject: SUNRPC: Convert the credcache lookup code to use RCU

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h    |  3 ++
 net/sunrpc/auth.c              | 91 ++++++++++++++++++++++++++----------------
 net/sunrpc/auth_gss/auth_gss.c | 22 +++++++---
 net/sunrpc/auth_unix.c         | 18 +++++++--
 4 files changed, 91 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 4e78f0c5f014..5974e8a493c4 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -16,6 +16,7 @@
 #include <linux/sunrpc/xdr.h>
 
 #include <asm/atomic.h>
+#include <linux/rcupdate.h>
 
 /* size of the nodename buffer */
 #define UNX_MAXNODENAME	32
@@ -35,6 +36,7 @@ struct rpc_credops;
 struct rpc_cred {
 	struct hlist_node	cr_hash;	/* hash chain */
 	struct list_head	cr_lru;		/* lru garbage collection */
+	struct rcu_head		cr_rcu;
 	struct rpc_auth *	cr_auth;
 	const struct rpc_credops *cr_ops;
 #ifdef RPC_DEBUG
@@ -50,6 +52,7 @@ struct rpc_cred {
 };
 #define RPCAUTH_CRED_NEW	0
 #define RPCAUTH_CRED_UPTODATE	1
+#define RPCAUTH_CRED_HASHED	2
 
 #define RPCAUTH_CRED_MAGIC	0x0f4aa4f0
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 00f9649b0901..ad7bde2c437e 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -112,6 +112,14 @@ rpcauth_release(struct rpc_auth *auth)
 
 static DEFINE_SPINLOCK(rpc_credcache_lock);
 
+static void
+rpcauth_unhash_cred_locked(struct rpc_cred *cred)
+{
+	hlist_del_rcu(&cred->cr_hash);
+	smp_mb__before_clear_bit();
+	clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+}
+
 /*
  * Initialize RPC credential cache
  */
@@ -166,8 +174,7 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 			cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
 			get_rpccred(cred);
 			list_move_tail(&cred->cr_lru, &free);
-			smp_wmb();
-			hlist_del_init(&cred->cr_hash);
+			rpcauth_unhash_cred_locked(cred);
 		}
 	}
 	spin_unlock(&rpc_credcache_lock);
@@ -207,8 +214,7 @@ rpcauth_prune_expired(struct list_head *free)
 			continue;
 		get_rpccred(cred);
 		list_add_tail(&cred->cr_lru, free);
-		smp_wmb();
-		hlist_del_init(&cred->cr_hash);
+		rpcauth_unhash_cred_locked(cred);
 	}
 }
 
@@ -218,10 +224,12 @@ rpcauth_prune_expired(struct list_head *free)
 static void
 rpcauth_gc_credcache(struct rpc_cred_cache *cache, struct list_head *free)
 {
-	if (time_before(jiffies, cache->nextgc))
+	if (list_empty(&cred_unused) || time_before(jiffies, cache->nextgc))
 		return;
+	spin_lock(&rpc_credcache_lock);
 	cache->nextgc = jiffies + cache->expire;
 	rpcauth_prune_expired(free);
+	spin_unlock(&rpc_credcache_lock);
 }
 
 /*
@@ -234,42 +242,57 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	LIST_HEAD(free);
 	struct rpc_cred_cache *cache = auth->au_credcache;
 	struct hlist_node *pos;
-	struct rpc_cred	*new = NULL,
-			*cred = NULL,
-			*entry;
+	struct rpc_cred	*cred = NULL,
+			*entry, *new;
 	int		nr = 0;
 
 	if (!(flags & RPCAUTH_LOOKUP_ROOTCREDS))
 		nr = acred->uid & RPC_CREDCACHE_MASK;
-retry:
-	spin_lock(&rpc_credcache_lock);
-	hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) {
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) {
 		if (!entry->cr_ops->crmatch(acred, entry, flags))
 			continue;
+		spin_lock(&rpc_credcache_lock);
+		if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
+			spin_unlock(&rpc_credcache_lock);
+			continue;
+		}
 		cred = get_rpccred(entry);
-		hlist_del(&entry->cr_hash);
+		spin_unlock(&rpc_credcache_lock);
 		break;
 	}
-	if (new) {
-		if (cred)
-			list_add_tail(&new->cr_lru, &free);
-		else
-			cred = new;
-	}
-	if (cred) {
-		hlist_add_head(&cred->cr_hash, &cache->hashtable[nr]);
+	rcu_read_unlock();
+
+	if (cred != NULL) {
+		rpcauth_gc_credcache(cache, &free);
+		goto found;
 	}
-	rpcauth_gc_credcache(cache, &free);
-	spin_unlock(&rpc_credcache_lock);
 
-	rpcauth_destroy_credlist(&free);
+	new = auth->au_ops->crcreate(auth, acred, flags);
+	if (IS_ERR(new)) {
+		cred = new;
+		goto out;
+	}
 
-	if (!cred) {
-		new = auth->au_ops->crcreate(auth, acred, flags);
-		if (!IS_ERR(new))
-			goto retry;
+	spin_lock(&rpc_credcache_lock);
+	hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) {
+		if (!entry->cr_ops->crmatch(acred, entry, flags))
+			continue;
+		cred = get_rpccred(entry);
+		break;
+	}
+	if (cred == NULL) {
 		cred = new;
-	} else if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)
+		set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+		hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]);
+	} else
+		list_add_tail(&new->cr_lru, &free);
+	rpcauth_prune_expired(&free);
+	cache->nextgc = jiffies + cache->expire;
+	spin_unlock(&rpc_credcache_lock);
+found:
+	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)
 			&& cred->cr_ops->cr_init != NULL
 			&& !(flags & RPCAUTH_LOOKUP_NEW)) {
 		int res = cred->cr_ops->cr_init(auth, cred);
@@ -278,8 +301,9 @@ retry:
 			cred = ERR_PTR(res);
 		}
 	}
-
-	return (struct rpc_cred *) cred;
+	rpcauth_destroy_credlist(&free);
+out:
+	return cred;
 }
 
 struct rpc_cred *
@@ -357,21 +381,20 @@ void
 put_rpccred(struct rpc_cred *cred)
 {
 	/* Fast path for unhashed credentials */
-	if (!hlist_unhashed(&cred->cr_hash))
+	if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
 		goto need_lock;
 
 	if (!atomic_dec_and_test(&cred->cr_count))
 		return;
 	goto out_destroy;
-
 need_lock:
 	if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
 		return;
 	if (!list_empty(&cred->cr_lru))
 		list_del_init(&cred->cr_lru);
 	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
-		hlist_del(&cred->cr_hash);
-	else if (!hlist_unhashed(&cred->cr_hash)) {
+		rpcauth_unhash_cred_locked(cred);
+	else if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
 		cred->cr_expire = jiffies;
 		list_add_tail(&cred->cr_lru, &cred_unused);
 		spin_unlock(&rpc_credcache_lock);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 55c47ae0a258..068fa6dfb64e 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -694,15 +694,25 @@ gss_destroy_ctx(struct gss_cl_ctx *ctx)
 }
 
 static void
-gss_destroy_cred(struct rpc_cred *rc)
+gss_free_cred(struct gss_cred *gss_cred)
 {
-	struct gss_cred *cred = container_of(rc, struct gss_cred, gc_base);
+	dprintk("RPC:       gss_free_cred %p\n", gss_cred);
+	if (gss_cred->gc_ctx)
+		gss_put_ctx(gss_cred->gc_ctx);
+	kfree(gss_cred);
+}
 
-	dprintk("RPC:       gss_destroy_cred \n");
+static void
+gss_free_cred_callback(struct rcu_head *head)
+{
+	struct gss_cred *gss_cred = container_of(head, struct gss_cred, gc_base.cr_rcu);
+	gss_free_cred(gss_cred);
+}
 
-	if (cred->gc_ctx)
-		gss_put_ctx(cred->gc_ctx);
-	kfree(cred);
+static void
+gss_destroy_cred(struct rpc_cred *cred)
+{
+	call_rcu(&cred->cr_rcu, gss_free_cred_callback);
 }
 
 /*
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 29d50ffa69d6..f7ff6ad3259e 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -93,11 +93,23 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 }
 
 static void
-unx_destroy_cred(struct rpc_cred *rcred)
+unx_free_cred(struct unx_cred *unx_cred)
 {
-	struct unx_cred	*cred = container_of(rcred, struct unx_cred, uc_base);
+	dprintk("RPC:       unx_free_cred %p\n", unx_cred);
+	kfree(unx_cred);
+}
+
+static void
+unx_free_cred_callback(struct rcu_head *head)
+{
+	struct unx_cred *unx_cred = container_of(head, struct unx_cred, uc_base.cr_rcu);
+	unx_free_cred(unx_cred);
+}
 
-	kfree(cred);
+static void
+unx_destroy_cred(struct rpc_cred *cred)
+{
+	call_rcu(&cred->cr_rcu, unx_free_cred_callback);
 }
 
 /*
-- 
cgit v1.3-8-gc7d7


From 9499b4341b56935f61af9e7e354e7d11e70f5258 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 24 Jun 2007 15:57:57 -0400
Subject: SUNRPC: Give credential cache a local spinlock

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h |  3 +++
 net/sunrpc/auth.c           | 46 +++++++++++++++++++++++++++++++--------------
 net/sunrpc/auth_unix.c      |  5 +++++
 net/sunrpc/sunrpc_syms.c    |  1 +
 4 files changed, 41 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 5974e8a493c4..e5a3b5141ed2 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -63,6 +63,7 @@ struct rpc_cred {
 #define RPC_CREDCACHE_MASK	(RPC_CREDCACHE_NR - 1)
 struct rpc_cred_cache {
 	struct hlist_head	hashtable[RPC_CREDCACHE_NR];
+	spinlock_t		lock;
 	unsigned long		nextgc;		/* next garbage collection */
 	unsigned long		expire;		/* cache expiry interval */
 };
@@ -126,6 +127,8 @@ struct rpc_credops {
 extern const struct rpc_authops	authunix_ops;
 extern const struct rpc_authops	authnull_ops;
 
+void __init		rpc_init_authunix(void);
+
 int			rpcauth_register(const struct rpc_authops *);
 int			rpcauth_unregister(const struct rpc_authops *);
 struct rpc_auth *	rpcauth_create(rpc_authflavor_t, struct rpc_clnt *);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index ad7bde2c437e..cf1198d10ee9 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -120,6 +120,18 @@ rpcauth_unhash_cred_locked(struct rpc_cred *cred)
 	clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
 }
 
+static void
+rpcauth_unhash_cred(struct rpc_cred *cred)
+{
+	spinlock_t *cache_lock;
+
+	cache_lock = &cred->cr_auth->au_credcache->lock;
+	spin_lock(cache_lock);
+	if (atomic_read(&cred->cr_count) == 0)
+		rpcauth_unhash_cred_locked(cred);
+	spin_unlock(cache_lock);
+}
+
 /*
  * Initialize RPC credential cache
  */
@@ -134,6 +146,7 @@ rpcauth_init_credcache(struct rpc_auth *auth, unsigned long expire)
 		return -ENOMEM;
 	for (i = 0; i < RPC_CREDCACHE_NR; i++)
 		INIT_HLIST_HEAD(&new->hashtable[i]);
+	spin_lock_init(&new->lock);
 	new->expire = expire;
 	new->nextgc = jiffies + (expire >> 1);
 	auth->au_credcache = new;
@@ -168,6 +181,7 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 	int		i;
 
 	spin_lock(&rpc_credcache_lock);
+	spin_lock(&cache->lock);
 	for (i = 0; i < RPC_CREDCACHE_NR; i++) {
 		head = &cache->hashtable[i];
 		while (!hlist_empty(head)) {
@@ -177,6 +191,7 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 			rpcauth_unhash_cred_locked(cred);
 		}
 	}
+	spin_unlock(&cache->lock);
 	spin_unlock(&rpc_credcache_lock);
 	rpcauth_destroy_credlist(&free);
 }
@@ -202,6 +217,7 @@ rpcauth_destroy_credcache(struct rpc_auth *auth)
 static void
 rpcauth_prune_expired(struct list_head *free)
 {
+	spinlock_t *cache_lock;
 	struct rpc_cred *cred;
 
 	while (!list_empty(&cred_unused)) {
@@ -212,9 +228,14 @@ rpcauth_prune_expired(struct list_head *free)
 		list_del_init(&cred->cr_lru);
 		if (atomic_read(&cred->cr_count) != 0)
 			continue;
-		get_rpccred(cred);
-		list_add_tail(&cred->cr_lru, free);
-		rpcauth_unhash_cred_locked(cred);
+		cache_lock = &cred->cr_auth->au_credcache->lock;
+		spin_lock(cache_lock);
+		if (atomic_read(&cred->cr_count) == 0) {
+			get_rpccred(cred);
+			list_add_tail(&cred->cr_lru, free);
+			rpcauth_unhash_cred_locked(cred);
+		}
+		spin_unlock(cache_lock);
 	}
 }
 
@@ -253,21 +274,19 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 	hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) {
 		if (!entry->cr_ops->crmatch(acred, entry, flags))
 			continue;
-		spin_lock(&rpc_credcache_lock);
+		spin_lock(&cache->lock);
 		if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
-			spin_unlock(&rpc_credcache_lock);
+			spin_unlock(&cache->lock);
 			continue;
 		}
 		cred = get_rpccred(entry);
-		spin_unlock(&rpc_credcache_lock);
+		spin_unlock(&cache->lock);
 		break;
 	}
 	rcu_read_unlock();
 
-	if (cred != NULL) {
-		rpcauth_gc_credcache(cache, &free);
+	if (cred != NULL)
 		goto found;
-	}
 
 	new = auth->au_ops->crcreate(auth, acred, flags);
 	if (IS_ERR(new)) {
@@ -275,7 +294,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 		goto out;
 	}
 
-	spin_lock(&rpc_credcache_lock);
+	spin_lock(&cache->lock);
 	hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) {
 		if (!entry->cr_ops->crmatch(acred, entry, flags))
 			continue;
@@ -288,9 +307,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 		hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]);
 	} else
 		list_add_tail(&new->cr_lru, &free);
-	rpcauth_prune_expired(&free);
-	cache->nextgc = jiffies + cache->expire;
-	spin_unlock(&rpc_credcache_lock);
+	spin_unlock(&cache->lock);
 found:
 	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags)
 			&& cred->cr_ops->cr_init != NULL
@@ -301,6 +318,7 @@ found:
 			cred = ERR_PTR(res);
 		}
 	}
+	rpcauth_gc_credcache(cache, &free);
 	rpcauth_destroy_credlist(&free);
 out:
 	return cred;
@@ -393,7 +411,7 @@ need_lock:
 	if (!list_empty(&cred->cr_lru))
 		list_del_init(&cred->cr_lru);
 	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
-		rpcauth_unhash_cred_locked(cred);
+		rpcauth_unhash_cred(cred);
 	else if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
 		cred->cr_expire = jiffies;
 		list_add_tail(&cred->cr_lru, &cred_unused);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index f7ff6ad3259e..205878a3caa5 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -213,6 +213,11 @@ unx_validate(struct rpc_task *task, __be32 *p)
 	return p;
 }
 
+void __init rpc_init_authunix(void)
+{
+	spin_lock_init(&unix_cred_cache.lock);
+}
+
 const struct rpc_authops authunix_ops = {
 	.owner		= THIS_MODULE,
 	.au_flavor	= RPC_AUTH_UNIX,
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 3e19e7af6799..018065fca84b 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -152,6 +152,7 @@ init_sunrpc(void)
 	cache_register(&ip_map_cache);
 	cache_register(&unix_gid_cache);
 	init_socket_xprt();
+	rpc_init_authunix();
 out:
 	return err;
 }
-- 
cgit v1.3-8-gc7d7


From f5c2187cfef628784d8a09b6d0f77888246d0c0f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 25 Jun 2007 17:11:20 -0400
Subject: SUNRPC: Convert the credential garbage collector into a shrinker
 callback

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth.h    |  6 ++--
 net/sunrpc/auth.c              | 63 ++++++++++++++++++++++++++++++------------
 net/sunrpc/auth_gss/auth_gss.c |  3 +-
 net/sunrpc/auth_unix.c         |  6 +---
 net/sunrpc/sunrpc_syms.c       |  3 +-
 5 files changed, 52 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index e5a3b5141ed2..7a69ca3bebaf 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -64,8 +64,6 @@ struct rpc_cred {
 struct rpc_cred_cache {
 	struct hlist_head	hashtable[RPC_CREDCACHE_NR];
 	spinlock_t		lock;
-	unsigned long		nextgc;		/* next garbage collection */
-	unsigned long		expire;		/* cache expiry interval */
 };
 
 struct rpc_authops;
@@ -128,6 +126,8 @@ extern const struct rpc_authops	authunix_ops;
 extern const struct rpc_authops	authnull_ops;
 
 void __init		rpc_init_authunix(void);
+void __init		rpcauth_init_module(void);
+void __exit		rpcauth_remove_module(void);
 
 int			rpcauth_register(const struct rpc_authops *);
 int			rpcauth_unregister(const struct rpc_authops *);
@@ -147,7 +147,7 @@ int			rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 int			rpcauth_refreshcred(struct rpc_task *);
 void			rpcauth_invalcred(struct rpc_task *);
 int			rpcauth_uptodatecred(struct rpc_task *);
-int			rpcauth_init_credcache(struct rpc_auth *, unsigned long);
+int			rpcauth_init_credcache(struct rpc_auth *);
 void			rpcauth_destroy_credcache(struct rpc_auth *);
 void			rpcauth_clear_credcache(struct rpc_cred_cache *);
 
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index cf1198d10ee9..81f4c776c558 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -26,6 +26,7 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
 };
 
 static LIST_HEAD(cred_unused);
+static unsigned long number_cred_unused;
 
 static u32
 pseudoflavor_to_flavor(u32 flavor) {
@@ -136,7 +137,7 @@ rpcauth_unhash_cred(struct rpc_cred *cred)
  * Initialize RPC credential cache
  */
 int
-rpcauth_init_credcache(struct rpc_auth *auth, unsigned long expire)
+rpcauth_init_credcache(struct rpc_auth *auth)
 {
 	struct rpc_cred_cache *new;
 	int i;
@@ -147,8 +148,6 @@ rpcauth_init_credcache(struct rpc_auth *auth, unsigned long expire)
 	for (i = 0; i < RPC_CREDCACHE_NR; i++)
 		INIT_HLIST_HEAD(&new->hashtable[i]);
 	spin_lock_init(&new->lock);
-	new->expire = expire;
-	new->nextgc = jiffies + (expire >> 1);
 	auth->au_credcache = new;
 	return 0;
 }
@@ -187,7 +186,11 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 		while (!hlist_empty(head)) {
 			cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
 			get_rpccred(cred);
-			list_move_tail(&cred->cr_lru, &free);
+			if (!list_empty(&cred->cr_lru)) {
+				list_del(&cred->cr_lru);
+				number_cred_unused--;
+			}
+			list_add_tail(&cred->cr_lru, &free);
 			rpcauth_unhash_cred_locked(cred);
 		}
 	}
@@ -214,18 +217,16 @@ rpcauth_destroy_credcache(struct rpc_auth *auth)
 /*
  * Remove stale credentials. Avoid sleeping inside the loop.
  */
-static void
-rpcauth_prune_expired(struct list_head *free)
+static int
+rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 {
 	spinlock_t *cache_lock;
 	struct rpc_cred *cred;
 
 	while (!list_empty(&cred_unused)) {
 		cred = list_entry(cred_unused.next, struct rpc_cred, cr_lru);
-		if (time_after(jiffies, cred->cr_expire +
-					cred->cr_auth->au_credcache->expire))
-			break;
 		list_del_init(&cred->cr_lru);
+		number_cred_unused--;
 		if (atomic_read(&cred->cr_count) != 0)
 			continue;
 		cache_lock = &cred->cr_auth->au_credcache->lock;
@@ -234,23 +235,32 @@ rpcauth_prune_expired(struct list_head *free)
 			get_rpccred(cred);
 			list_add_tail(&cred->cr_lru, free);
 			rpcauth_unhash_cred_locked(cred);
+			nr_to_scan--;
 		}
 		spin_unlock(cache_lock);
+		if (nr_to_scan == 0)
+			break;
 	}
+	return nr_to_scan;
 }
 
 /*
- * Run garbage collector.
+ * Run memory cache shrinker.
  */
-static void
-rpcauth_gc_credcache(struct rpc_cred_cache *cache, struct list_head *free)
+static int
+rpcauth_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
 {
-	if (list_empty(&cred_unused) || time_before(jiffies, cache->nextgc))
-		return;
+	LIST_HEAD(free);
+	int res;
+
+	if (list_empty(&cred_unused))
+		return 0;
 	spin_lock(&rpc_credcache_lock);
-	cache->nextgc = jiffies + cache->expire;
-	rpcauth_prune_expired(free);
+	nr_to_scan = rpcauth_prune_expired(&free, nr_to_scan);
+	res = (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
 	spin_unlock(&rpc_credcache_lock);
+	rpcauth_destroy_credlist(&free);
+	return res;
 }
 
 /*
@@ -318,7 +328,6 @@ found:
 			cred = ERR_PTR(res);
 		}
 	}
-	rpcauth_gc_credcache(cache, &free);
 	rpcauth_destroy_credlist(&free);
 out:
 	return cred;
@@ -408,13 +417,16 @@ put_rpccred(struct rpc_cred *cred)
 need_lock:
 	if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
 		return;
-	if (!list_empty(&cred->cr_lru))
+	if (!list_empty(&cred->cr_lru)) {
+		number_cred_unused--;
 		list_del_init(&cred->cr_lru);
+	}
 	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
 		rpcauth_unhash_cred(cred);
 	else if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
 		cred->cr_expire = jiffies;
 		list_add_tail(&cred->cr_lru, &cred_unused);
+		number_cred_unused++;
 		spin_unlock(&rpc_credcache_lock);
 		return;
 	}
@@ -520,3 +532,18 @@ rpcauth_uptodatecred(struct rpc_task *task)
 	return cred == NULL ||
 		test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0;
 }
+
+
+static struct shrinker *rpc_cred_shrinker;
+
+void __init rpcauth_init_module(void)
+{
+	rpc_init_authunix();
+	rpc_cred_shrinker = set_shrinker(DEFAULT_SEEKS, rpcauth_cache_shrinker);
+}
+
+void __exit rpcauth_remove_module(void)
+{
+	if (rpc_cred_shrinker != NULL)
+		remove_shrinker(rpc_cred_shrinker);
+}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 068fa6dfb64e..8653a92144ae 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -64,7 +64,6 @@ static const struct rpc_credops gss_credops;
 
 #define NFS_NGROUPS	16
 
-#define GSS_CRED_EXPIRE		(60 * HZ)	/* XXX: reasonable? */
 #define GSS_CRED_SLACK		1024		/* XXX: unused */
 /* length of a krb5 verifier (48), plus data added before arguments when
  * using integrity (two 4-byte integers): */
@@ -643,7 +642,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 		goto err_put_mech;
 	}
 
-	err = rpcauth_init_credcache(auth, GSS_CRED_EXPIRE);
+	err = rpcauth_init_credcache(auth);
 	if (err)
 		goto err_unlink_pipe;
 
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 205878a3caa5..d9c50d810d15 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -21,8 +21,6 @@ struct unx_cred {
 };
 #define uc_uid			uc_base.cr_uid
 
-#define UNX_CRED_EXPIRE		(60 * HZ)
-
 #define UNX_WRITESLACK		(21 + (UNX_MAXNODENAME >> 2))
 
 #ifdef RPC_DEBUG
@@ -38,8 +36,7 @@ unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 {
 	dprintk("RPC:       creating UNIX authenticator for client %p\n",
 			clnt);
-	if (atomic_inc_return(&unix_auth.au_count) == 1)
-		unix_cred_cache.nextgc = jiffies + (unix_cred_cache.expire >> 1);
+	atomic_inc(&unix_auth.au_count);
 	return &unix_auth;
 }
 
@@ -232,7 +229,6 @@ const struct rpc_authops authunix_ops = {
 
 static
 struct rpc_cred_cache	unix_cred_cache = {
-	.expire		= UNX_CRED_EXPIRE,
 };
 
 static
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 018065fca84b..384c4ad5ab86 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -152,7 +152,7 @@ init_sunrpc(void)
 	cache_register(&ip_map_cache);
 	cache_register(&unix_gid_cache);
 	init_socket_xprt();
-	rpc_init_authunix();
+	rpcauth_init_module();
 out:
 	return err;
 }
@@ -160,6 +160,7 @@ out:
 static void __exit
 cleanup_sunrpc(void)
 {
+	rpcauth_remove_module();
 	cleanup_socket_xprt();
 	unregister_rpc_pipefs();
 	rpc_destroy_mempool();
-- 
cgit v1.3-8-gc7d7


From 5d28dc82074f1e64b22c9424b161abc1f5d6bcdb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 26 Jun 2007 19:18:38 -0400
Subject: SUNRPC: Convert gss_ctx_lock to an RCU lock

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/auth_gss.h |  1 +
 net/sunrpc/auth_gss/auth_gss.c  | 53 ++++++++++++++++++++++++++++-------------
 2 files changed, 37 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 0bd1d06777b9..67658e17a375 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -75,6 +75,7 @@ struct gss_cl_ctx {
 	struct xdr_netobj	gc_wire_ctx;
 	u32			gc_win;
 	unsigned long		gc_expiry;
+	struct rcu_head		gc_rcu;
 };
 
 struct gss_upcall_msg;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 8653a92144ae..15da6f82db36 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -78,8 +78,6 @@ static const struct rpc_credops gss_credops;
 /* dump the buffer in `emacs-hexl' style */
 #define isprint(c)      ((c > 0x1f) && (c < 0x7f))
 
-static DEFINE_RWLOCK(gss_ctx_lock);
-
 struct gss_auth {
 	struct rpc_auth rpc_auth;
 	struct gss_api_mech *mech;
@@ -88,7 +86,7 @@ struct gss_auth {
 	struct dentry *dentry;
 };
 
-static void gss_destroy_ctx(struct gss_cl_ctx *);
+static void gss_free_ctx(struct gss_cl_ctx *);
 static struct rpc_pipe_ops gss_upcall_ops;
 
 static inline struct gss_cl_ctx *
@@ -102,20 +100,24 @@ static inline void
 gss_put_ctx(struct gss_cl_ctx *ctx)
 {
 	if (atomic_dec_and_test(&ctx->count))
-		gss_destroy_ctx(ctx);
+		gss_free_ctx(ctx);
 }
 
+/* gss_cred_set_ctx:
+ * called by gss_upcall_callback and gss_create_upcall in order
+ * to set the gss context. The actual exchange of an old context
+ * and a new one is protected by the inode->i_lock.
+ */
 static void
 gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
 {
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
 	struct gss_cl_ctx *old;
-	write_lock(&gss_ctx_lock);
+
 	old = gss_cred->gc_ctx;
-	gss_cred->gc_ctx = ctx;
+	rcu_assign_pointer(gss_cred->gc_ctx, ctx);
 	set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
-	write_unlock(&gss_ctx_lock);
 	if (old)
 		gss_put_ctx(old);
 }
@@ -126,10 +128,10 @@ gss_cred_is_uptodate_ctx(struct rpc_cred *cred)
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
 	int res = 0;
 
-	read_lock(&gss_ctx_lock);
+	rcu_read_lock();
 	if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) && gss_cred->gc_ctx)
 		res = 1;
-	read_unlock(&gss_ctx_lock);
+	rcu_read_unlock();
 	return res;
 }
 
@@ -168,10 +170,10 @@ gss_cred_get_ctx(struct rpc_cred *cred)
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
 	struct gss_cl_ctx *ctx = NULL;
 
-	read_lock(&gss_ctx_lock);
+	rcu_read_lock();
 	if (gss_cred->gc_ctx)
 		ctx = gss_get_ctx(gss_cred->gc_ctx);
-	read_unlock(&gss_ctx_lock);
+	rcu_read_unlock();
 	return ctx;
 }
 
@@ -333,11 +335,11 @@ gss_upcall_callback(struct rpc_task *task)
 	struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
 	struct inode *inode = gss_msg->auth->dentry->d_inode;
 
+	spin_lock(&inode->i_lock);
 	if (gss_msg->ctx)
 		gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_get_ctx(gss_msg->ctx));
 	else
 		task->tk_status = gss_msg->msg.errno;
-	spin_lock(&inode->i_lock);
 	gss_cred->gc_upcall = NULL;
 	rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno);
 	spin_unlock(&inode->i_lock);
@@ -440,7 +442,6 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 		prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_INTERRUPTIBLE);
 		spin_lock(&inode->i_lock);
 		if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) {
-			spin_unlock(&inode->i_lock);
 			break;
 		}
 		spin_unlock(&inode->i_lock);
@@ -454,6 +455,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 		gss_cred_set_ctx(cred, gss_get_ctx(gss_msg->ctx));
 	else
 		err = gss_msg->msg.errno;
+	spin_unlock(&inode->i_lock);
 out_intr:
 	finish_wait(&gss_msg->waitqueue, &wait);
 	gss_release_msg(gss_msg);
@@ -681,9 +683,9 @@ gss_destroy(struct rpc_auth *auth)
  * to create a new cred or context, so they check that things have been
  * allocated before freeing them. */
 static void
-gss_destroy_ctx(struct gss_cl_ctx *ctx)
+gss_do_free_ctx(struct gss_cl_ctx *ctx)
 {
-	dprintk("RPC:       gss_destroy_ctx\n");
+	dprintk("RPC:       gss_free_ctx\n");
 
 	if (ctx->gc_gss_ctx)
 		gss_delete_sec_context(&ctx->gc_gss_ctx);
@@ -692,12 +694,23 @@ gss_destroy_ctx(struct gss_cl_ctx *ctx)
 	kfree(ctx);
 }
 
+static void
+gss_free_ctx_callback(struct rcu_head *head)
+{
+	struct gss_cl_ctx *ctx = container_of(head, struct gss_cl_ctx, gc_rcu);
+	gss_do_free_ctx(ctx);
+}
+
+static void
+gss_free_ctx(struct gss_cl_ctx *ctx)
+{
+	call_rcu(&ctx->gc_rcu, gss_free_ctx_callback);
+}
+
 static void
 gss_free_cred(struct gss_cred *gss_cred)
 {
 	dprintk("RPC:       gss_free_cred %p\n", gss_cred);
-	if (gss_cred->gc_ctx)
-		gss_put_ctx(gss_cred->gc_ctx);
 	kfree(gss_cred);
 }
 
@@ -711,7 +724,13 @@ gss_free_cred_callback(struct rcu_head *head)
 static void
 gss_destroy_cred(struct rpc_cred *cred)
 {
+	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+	struct gss_cl_ctx *ctx = gss_cred->gc_ctx;
+
+	rcu_assign_pointer(gss_cred->gc_ctx, NULL);
 	call_rcu(&cred->cr_rcu, gss_free_cred_callback);
+	if (ctx)
+		gss_put_ctx(ctx);
 }
 
 /*
-- 
cgit v1.3-8-gc7d7


From 1be27f36601973815171db684c711d30557cf50c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 27 Jun 2007 14:29:04 -0400
Subject: SUNRPC: Remove the tk_auth macro...

We should almost always be deferencing the rpc_auth struct by means of the
credential's cr_auth field instead of the rpc_clnt->cl_auth anyway. Fix up
that historical mistake, and remove the macro that propagated it.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c               |  6 +++---
 fs/nfs/nfs3xdr.c               |  8 ++++----
 fs/nfs/nfs4xdr.c               | 10 +++++-----
 include/linux/sunrpc/sched.h   |  1 -
 net/sunrpc/auth.c              | 25 +++++++++++++------------
 net/sunrpc/auth_gss/auth_gss.c |  4 ++--
 net/sunrpc/auth_unix.c         |  2 +-
 net/sunrpc/clnt.c              |  2 +-
 8 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index cd3ca7b5d3db..7fcc78f2aa71 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -223,7 +223,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
 static int
 nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_auth;
+	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 	u32 offset = (u32)args->offset;
 	u32 count = args->count;
@@ -380,7 +380,7 @@ static int
 nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
 {
 	struct rpc_task	*task = req->rq_task;
-	struct rpc_auth	*auth = task->tk_auth;
+	struct rpc_auth	*auth = task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -541,7 +541,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 static int
 nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
 {
-	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index b51df8eb9f01..b4647a22f349 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -319,7 +319,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
 static int
 nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_auth;
+	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -458,7 +458,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
 static int
 nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_auth;
+	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 	u32 count = args->count;
 
@@ -643,7 +643,7 @@ static int
 nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
 		    struct nfs3_getaclargs *args)
 {
-	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
@@ -773,7 +773,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
 static int
 nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
 {
-	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct rpc_auth	*auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 
 	p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1fcca516e6ee..859b13633258 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1071,7 +1071,7 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
 
 static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req)
 {
-	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	uint32_t attrs[2] = {
 		FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
 		FATTR4_WORD1_MOUNTED_ON_FILEID,
@@ -1117,7 +1117,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 
 static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req)
 {
-	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	unsigned int replen;
 	__be32 *p;
 
@@ -1735,7 +1735,7 @@ out:
  */
 static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-	struct rpc_auth	*auth = req->rq_task->tk_auth;
+	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 2,
@@ -1795,7 +1795,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
 		struct nfs_getaclargs *args)
 {
 	struct xdr_stream xdr;
-	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	struct compound_hdr hdr = {
 		.nops   = 2,
 	};
@@ -2030,7 +2030,7 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
 	struct compound_hdr hdr = {
 		.nops = 3,
 	};
-	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
 	int replen;
 	int status;
 
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 3387b008cdfc..8ea077db0099 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -98,7 +98,6 @@ struct rpc_task {
 	unsigned short		tk_pid;		/* debugging aid */
 #endif
 };
-#define tk_auth			tk_client->cl_auth
 #define tk_xprt			tk_client->cl_xprt
 
 /* support walking a list of tasks on a wait queue */
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 81f4c776c558..74baf87ccff9 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL(rpcauth_init_cred);
 struct rpc_cred *
 rpcauth_bindcred(struct rpc_task *task)
 {
-	struct rpc_auth *auth = task->tk_auth;
+	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct auth_cred acred = {
 		.uid = current->fsuid,
 		.gid = current->fsgid,
@@ -381,7 +381,7 @@ rpcauth_bindcred(struct rpc_task *task)
 	int flags = 0;
 
 	dprintk("RPC: %5u looking up %s cred\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name);
+		task->tk_pid, task->tk_client->cl_auth->au_ops->au_name);
 	get_group_info(acred.group_info);
 	if (task->tk_flags & RPC_TASK_ROOTCREDS)
 		flags |= RPCAUTH_LOOKUP_ROOTCREDS;
@@ -397,11 +397,12 @@ rpcauth_bindcred(struct rpc_task *task)
 void
 rpcauth_holdcred(struct rpc_task *task)
 {
-	dprintk("RPC: %5u holding %s cred %p\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name,
-		task->tk_msg.rpc_cred);
-	if (task->tk_msg.rpc_cred)
-		get_rpccred(task->tk_msg.rpc_cred);
+	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	if (cred != NULL) {
+		get_rpccred(cred);
+		dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
+				cred->cr_auth->au_ops->au_name, cred);
+	}
 }
 
 void
@@ -441,7 +442,7 @@ rpcauth_unbindcred(struct rpc_task *task)
 	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
 
 	dprintk("RPC: %5u releasing %s cred %p\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
 
 	put_rpccred(cred);
 	task->tk_msg.rpc_cred = NULL;
@@ -453,7 +454,7 @@ rpcauth_marshcred(struct rpc_task *task, __be32 *p)
 	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
 
 	dprintk("RPC: %5u marshaling %s cred %p\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
 
 	return cred->cr_ops->crmarshal(task, p);
 }
@@ -464,7 +465,7 @@ rpcauth_checkverf(struct rpc_task *task, __be32 *p)
 	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
 
 	dprintk("RPC: %5u validating %s cred %p\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
 
 	return cred->cr_ops->crvalidate(task, p);
 }
@@ -505,7 +506,7 @@ rpcauth_refreshcred(struct rpc_task *task)
 	int err;
 
 	dprintk("RPC: %5u refreshing %s cred %p\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
 
 	err = cred->cr_ops->crrefresh(task);
 	if (err < 0)
@@ -519,7 +520,7 @@ rpcauth_invalcred(struct rpc_task *task)
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
 
 	dprintk("RPC: %5u invalidating %s cred %p\n",
-		task->tk_pid, task->tk_auth->au_ops->au_name, cred);
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
 	if (cred)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 }
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 15da6f82db36..debcda86467c 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -906,7 +906,7 @@ gss_validate(struct rpc_task *task, __be32 *p)
 		goto out_bad;
 	/* We leave it to unwrap to calculate au_rslack. For now we just
 	 * calculate the length of the verifier: */
-	task->tk_auth->au_verfsize = XDR_QUADLEN(len) + 2;
+	cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
 	gss_put_ctx(ctx);
 	dprintk("RPC: %5u gss_validate: gss_verify_mic succeeded.\n",
 			task->tk_pid);
@@ -1206,7 +1206,7 @@ gss_unwrap_resp(struct rpc_task *task,
 			break;
 	}
 	/* take into account extra slack for integrity and privacy cases: */
-	task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp)
+	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp)
 						+ (savedlen - head->iov_len);
 out_decode:
 	status = decode(rqstp, p, obj);
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index d9c50d810d15..5ed91e5bcee4 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -204,7 +204,7 @@ unx_validate(struct rpc_task *task, __be32 *p)
 		printk("RPC: giant verf size: %u\n", size);
 		return NULL;
 	}
-	task->tk_auth->au_rslack = (size >> 2) + 2;
+	task->tk_msg.rpc_cred->cr_auth->au_rslack = (size >> 2) + 2;
 	p += (size >> 2);
 
 	return p;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 28a789419f64..50af8bbe7f20 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -787,7 +787,7 @@ call_reserveresult(struct rpc_task *task)
 static void
 call_allocate(struct rpc_task *task)
 {
-	unsigned int slack = task->tk_auth->au_cslack;
+	unsigned int slack = task->tk_msg.rpc_cred->cr_auth->au_cslack;
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = task->tk_xprt;
 	struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
-- 
cgit v1.3-8-gc7d7


From 587142f85f796cf0b823dd3080e815f02ff6b952 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 2 Jul 2007 09:57:54 -0400
Subject: NFS: Replace NFS_I(inode)->req_lock with inode->i_lock

There is no justification for keeping a special spinlock for the exclusive
use of the NFS writeback code.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  1 -
 fs/nfs/pagelist.c      | 11 ++++---
 fs/nfs/write.c         | 84 ++++++++++++++++++++++++--------------------------
 include/linux/nfs_fs.h |  1 -
 4 files changed, 46 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 01fc8ab0c562..9d5124166d20 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1154,7 +1154,6 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
 
 	inode_init_once(&nfsi->vfs_inode);
-	spin_lock_init(&nfsi->req_lock);
 	INIT_LIST_HEAD(&nfsi->open_files);
 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 8d2642f24b8f..f56dae5216f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -126,12 +126,13 @@ static int nfs_set_page_tag_locked(struct nfs_page *req)
  */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
 
 	if (req->wb_page != NULL) {
-		spin_lock(&nfsi->req_lock);
+		spin_lock(&inode->i_lock);
 		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
-		spin_unlock(&nfsi->req_lock);
+		spin_unlock(&inode->i_lock);
 	}
 	nfs_unlock_request(req);
 }
@@ -390,7 +391,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
  * If the number of requests is set to 0, the entire address_space
  * starting at index idx_start, is scanned.
  * The requests are *not* checked to ensure that they form a contiguous set.
- * You must be holding the inode's req_lock when calling this function
+ * You must be holding the inode's i_lock when calling this function
  */
 int nfs_scan_list(struct nfs_inode *nfsi,
 		struct list_head *dst, pgoff_t idx_start,
@@ -430,7 +431,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
 			}
 		}
 		/* for latency reduction */
-		cond_resched_lock(&nfsi->req_lock);
+		cond_resched_lock(&nfsi->vfs_inode.i_lock);
 	}
 out:
 	return res;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9ef9ec746bfb..73ac992ece85 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -124,12 +124,12 @@ static struct nfs_page *nfs_page_find_request_locked(struct page *page)
 
 static struct nfs_page *nfs_page_find_request(struct page *page)
 {
+	struct inode *inode = page->mapping->host;
 	struct nfs_page *req = NULL;
-	spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
 
-	spin_lock(req_lock);
+	spin_lock(&inode->i_lock);
 	req = nfs_page_find_request_locked(page);
-	spin_unlock(req_lock);
+	spin_unlock(&inode->i_lock);
 	return req;
 }
 
@@ -251,16 +251,16 @@ static void nfs_end_page_writeback(struct page *page)
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 				struct page *page)
 {
+	struct inode *inode = page->mapping->host;
+	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *req;
-	struct nfs_inode *nfsi = NFS_I(page->mapping->host);
-	spinlock_t *req_lock = &nfsi->req_lock;
 	int ret;
 
-	spin_lock(req_lock);
+	spin_lock(&inode->i_lock);
 	for(;;) {
 		req = nfs_page_find_request_locked(page);
 		if (req == NULL) {
-			spin_unlock(req_lock);
+			spin_unlock(&inode->i_lock);
 			return 1;
 		}
 		if (nfs_lock_request_dontget(req))
@@ -270,28 +270,28 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 		 *	 succeed provided that someone hasn't already marked the
 		 *	 request as dirty (in which case we don't care).
 		 */
-		spin_unlock(req_lock);
+		spin_unlock(&inode->i_lock);
 		ret = nfs_wait_on_request(req);
 		nfs_release_request(req);
 		if (ret != 0)
 			return ret;
-		spin_lock(req_lock);
+		spin_lock(&inode->i_lock);
 	}
 	if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
 		/* This request is marked for commit */
-		spin_unlock(req_lock);
+		spin_unlock(&inode->i_lock);
 		nfs_unlock_request(req);
 		nfs_pageio_complete(pgio);
 		return 1;
 	}
 	if (nfs_set_page_writeback(page) != 0) {
-		spin_unlock(req_lock);
+		spin_unlock(&inode->i_lock);
 		BUG();
 	}
 	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
 			NFS_PAGE_TAG_LOCKED);
 	ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
-	spin_unlock(req_lock);
+	spin_unlock(&inode->i_lock);
 	nfs_pageio_add_request(pgio, req);
 	return ret;
 }
@@ -412,7 +412,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
 	BUG_ON (!NFS_WBACK_BUSY(req));
 
-	spin_lock(&nfsi->req_lock);
+	spin_lock(&inode->i_lock);
 	set_page_private(req->wb_page, 0);
 	ClearPagePrivate(req->wb_page);
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
@@ -420,11 +420,11 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		__set_page_dirty_nobuffers(req->wb_page);
 	nfsi->npages--;
 	if (!nfsi->npages) {
-		spin_unlock(&nfsi->req_lock);
+		spin_unlock(&inode->i_lock);
 		nfs_end_data_update(inode);
 		iput(inode);
 	} else
-		spin_unlock(&nfsi->req_lock);
+		spin_unlock(&inode->i_lock);
 	nfs_clear_request(req);
 	nfs_release_request(req);
 }
@@ -458,13 +458,13 @@ nfs_mark_request_commit(struct nfs_page *req)
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	spin_lock(&nfsi->req_lock);
+	spin_lock(&inode->i_lock);
 	nfsi->ncommit++;
 	set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
 	radix_tree_tag_set(&nfsi->nfs_page_tree,
 			req->wb_index,
 			NFS_PAGE_TAG_COMMIT);
-	spin_unlock(&nfsi->req_lock);
+	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
@@ -534,10 +534,10 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u
 		BUG_ON(!NFS_WBACK_BUSY(req));
 
 		kref_get(&req->wb_kref);
-		spin_unlock(&nfsi->req_lock);
+		spin_unlock(&inode->i_lock);
 		error = nfs_wait_on_request(req);
 		nfs_release_request(req);
-		spin_lock(&nfsi->req_lock);
+		spin_lock(&inode->i_lock);
 		if (error < 0)
 			return error;
 		res++;
@@ -602,7 +602,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
-	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page		*req, *new = NULL;
 	pgoff_t		rqend, end;
 
@@ -612,13 +611,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 		/* Loop over all inode entries and see if we find
 		 * A request for the page we wish to update
 		 */
-		spin_lock(&nfsi->req_lock);
+		spin_lock(&inode->i_lock);
 		req = nfs_page_find_request_locked(page);
 		if (req) {
 			if (!nfs_lock_request_dontget(req)) {
 				int error;
 
-				spin_unlock(&nfsi->req_lock);
+				spin_unlock(&inode->i_lock);
 				error = nfs_wait_on_request(req);
 				nfs_release_request(req);
 				if (error < 0) {
@@ -628,7 +627,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 				}
 				continue;
 			}
-			spin_unlock(&nfsi->req_lock);
+			spin_unlock(&inode->i_lock);
 			if (new)
 				nfs_release_request(new);
 			break;
@@ -639,14 +638,14 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 			nfs_lock_request_dontget(new);
 			error = nfs_inode_add_request(inode, new);
 			if (error) {
-				spin_unlock(&nfsi->req_lock);
+				spin_unlock(&inode->i_lock);
 				nfs_unlock_request(new);
 				return ERR_PTR(error);
 			}
-			spin_unlock(&nfsi->req_lock);
+			spin_unlock(&inode->i_lock);
 			return new;
 		}
-		spin_unlock(&nfsi->req_lock);
+		spin_unlock(&inode->i_lock);
 
 		new = nfs_create_request(ctx, inode, page, offset, bytes);
 		if (IS_ERR(new))
@@ -974,9 +973,9 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 	}
 
 	if (nfs_write_need_commit(data)) {
-		spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
+		struct inode *inode = page->mapping->host;
 
-		spin_lock(req_lock);
+		spin_lock(&inode->i_lock);
 		if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
 			/* Do nothing we need to resend the writes */
 		} else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) {
@@ -987,7 +986,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 			clear_bit(PG_NEED_COMMIT, &req->wb_flags);
 			dprintk(" server reboot detected\n");
 		}
-		spin_unlock(req_lock);
+		spin_unlock(&inode->i_lock);
 	} else
 		dprintk(" OK\n");
 
@@ -1277,13 +1276,12 @@ static const struct rpc_call_ops nfs_commit_ops = {
 
 int nfs_commit_inode(struct inode *inode, int how)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	LIST_HEAD(head);
 	int res;
 
-	spin_lock(&nfsi->req_lock);
+	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
-	spin_unlock(&nfsi->req_lock);
+	spin_unlock(&inode->i_lock);
 	if (res) {
 		int error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
@@ -1301,7 +1299,6 @@ static inline int nfs_commit_list(struct inode *inode, struct list_head *head, i
 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
 {
 	struct inode *inode = mapping->host;
-	struct nfs_inode *nfsi = NFS_I(inode);
 	pgoff_t idx_start, idx_end;
 	unsigned int npages = 0;
 	LIST_HEAD(head);
@@ -1323,7 +1320,7 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 		}
 	}
 	how &= ~FLUSH_NOCOMMIT;
-	spin_lock(&nfsi->req_lock);
+	spin_lock(&inode->i_lock);
 	do {
 		ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
 		if (ret != 0)
@@ -1334,18 +1331,19 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 		if (pages == 0)
 			break;
 		if (how & FLUSH_INVALIDATE) {
-			spin_unlock(&nfsi->req_lock);
+			spin_unlock(&inode->i_lock);
 			nfs_cancel_commit_list(&head);
 			ret = pages;
-			spin_lock(&nfsi->req_lock);
+			spin_lock(&inode->i_lock);
 			continue;
 		}
 		pages += nfs_scan_commit(inode, &head, 0, 0);
-		spin_unlock(&nfsi->req_lock);
+		spin_unlock(&inode->i_lock);
 		ret = nfs_commit_list(inode, &head, how);
-		spin_lock(&nfsi->req_lock);
+		spin_lock(&inode->i_lock);
+
 	} while (ret >= 0);
-	spin_unlock(&nfsi->req_lock);
+	spin_unlock(&inode->i_lock);
 	return ret;
 }
 
@@ -1439,7 +1437,6 @@ int nfs_set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode;
-	spinlock_t *req_lock;
 	struct nfs_page *req;
 	int ret;
 
@@ -1448,18 +1445,17 @@ int nfs_set_page_dirty(struct page *page)
 	inode = mapping->host;
 	if (!inode)
 		goto out_raced;
-	req_lock = &NFS_I(inode)->req_lock;
-	spin_lock(req_lock);
+	spin_lock(&inode->i_lock);
 	req = nfs_page_find_request_locked(page);
 	if (req != NULL) {
 		/* Mark any existing write requests for flushing */
 		ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags);
-		spin_unlock(req_lock);
+		spin_unlock(&inode->i_lock);
 		nfs_release_request(req);
 		return ret;
 	}
 	ret = __set_page_dirty_nobuffers(page);
-	spin_unlock(req_lock);
+	spin_unlock(&inode->i_lock);
 	return ret;
 out_raced:
 	return !TestSetPageDirty(page);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index bf24151d63be..cf395351cdd4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -156,7 +156,6 @@ struct nfs_inode {
 	/*
 	 * This is the list of dirty unwritten pages.
 	 */
-	spinlock_t		req_lock;
 	struct radix_tree_root	nfs_page_tree;
 
 	unsigned long		ncommit,
-- 
cgit v1.3-8-gc7d7


From 7af654f8d1b7460415af5d1d326233478dd0f563 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 2 Jul 2007 12:49:23 -0400
Subject: NFSv4: Don't reuse expired nfs4_state_owner structs

That just confuses certain NFSv4 servers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 10 ----------
 fs/nfs/nfs4state.c        | 28 ----------------------------
 include/linux/nfs_fs_sb.h |  2 --
 3 files changed, 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 71d4c4cdac52..6b424407d631 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -131,7 +131,6 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
 	init_rwsem(&clp->cl_sem);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_state_owners);
-	INIT_LIST_HEAD(&clp->cl_unused);
 	spin_lock_init(&clp->cl_lock);
 	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
 	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -155,15 +154,6 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 #ifdef CONFIG_NFS_V4
 	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
 		nfs4_kill_renewd(clp);
-	while (!list_empty(&clp->cl_unused)) {
-		struct nfs4_state_owner *sp;
-
-		sp = list_entry(clp->cl_unused.next,
-				struct nfs4_state_owner,
-				so_list);
-		list_del(&sp->so_list);
-		kfree(sp);
-	}
 	BUG_ON(!list_empty(&clp->cl_state_owners));
 	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
 		nfs_idmap_delete(clp);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0030248d63ec..2b00c45aebea 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -75,21 +75,6 @@ nfs4_alloc_lockowner_id(struct nfs_client *clp)
 	return clp->cl_lockowner_id ++;
 }
 
-static struct nfs4_state_owner *
-nfs4_client_grab_unused(struct nfs_client *clp, struct rpc_cred *cred)
-{
-	struct nfs4_state_owner *sp = NULL;
-
-	if (!list_empty(&clp->cl_unused)) {
-		sp = list_entry(clp->cl_unused.next, struct nfs4_state_owner, so_list);
-		atomic_inc(&sp->so_count);
-		sp->so_cred = get_rpccred(cred);
-		list_move(&sp->so_list, &clp->cl_state_owners);
-		clp->cl_nunused--;
-	}
-	return sp;
-}
-
 struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
@@ -178,8 +163,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	new = nfs4_alloc_state_owner();
 	spin_lock(&clp->cl_lock);
 	sp = nfs4_find_state_owner(clp, cred);
-	if (sp == NULL)
-		sp = nfs4_client_grab_unused(clp, cred);
 	if (sp == NULL && new != NULL) {
 		list_add(&new->so_list, &clp->cl_state_owners);
 		new->so_client = clp;
@@ -206,17 +189,6 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
 		return;
-	if (clp->cl_nunused >= OPENOWNER_POOL_SIZE)
-		goto out_free;
-	if (list_empty(&sp->so_list))
-		goto out_free;
-	list_move(&sp->so_list, &clp->cl_unused);
-	clp->cl_nunused++;
-	spin_unlock(&clp->cl_lock);
-	put_rpccred(cred);
-	cred = NULL;
-	return;
-out_free:
 	list_del(&sp->so_list);
 	spin_unlock(&clp->cl_lock);
 	put_rpccred(cred);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 144d955dc46a..2cef0a68aa77 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -44,8 +44,6 @@ struct nfs_client {
 
 	struct list_head	cl_delegations;
 	struct list_head	cl_state_owners;
-	struct list_head	cl_unused;
-	int			cl_nunused;
 	spinlock_t		cl_lock;
 
 	unsigned long		cl_lease_time;
-- 
cgit v1.3-8-gc7d7


From 9f958ab8858c75df800e0121b1920182820cbc39 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 2 Jul 2007 13:58:33 -0400
Subject: NFSv4: Reduce the chances of an open_owner identifier collision

Currently we just use a 32-bit counter.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |   3 +-
 fs/nfs/nfs4_fs.h          |  17 ++--
 fs/nfs/nfs4proc.c         |   8 +-
 fs/nfs/nfs4state.c        | 193 ++++++++++++++++++++++++++++++++++++----------
 fs/nfs/nfs4xdr.c          |  27 ++++---
 include/linux/nfs_fs_sb.h |   5 +-
 include/linux/nfs_xdr.h   |   4 +-
 7 files changed, 190 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6b424407d631..ccb455053ee4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -130,7 +130,6 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
 #ifdef CONFIG_NFS_V4
 	init_rwsem(&clp->cl_sem);
 	INIT_LIST_HEAD(&clp->cl_delegations);
-	INIT_LIST_HEAD(&clp->cl_state_owners);
 	spin_lock_init(&clp->cl_lock);
 	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
 	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -154,7 +153,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
 #ifdef CONFIG_NFS_V4
 	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
 		nfs4_kill_renewd(clp);
-	BUG_ON(!list_empty(&clp->cl_state_owners));
+	BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
 	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
 		nfs_idmap_delete(clp);
 #endif
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c97a0ad8430e..44b56c915f72 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -70,19 +70,25 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status
 		seqid->flags |= NFS_SEQID_CONFIRMED;
 }
 
+struct nfs_unique_id {
+	struct rb_node rb_node;
+	__u64 id;
+};
+
 /*
  * NFS4 state_owners and lock_owners are simply labels for ordered
  * sequences of RPC calls. Their sole purpose is to provide once-only
  * semantics by allowing the server to identify replayed requests.
  */
 struct nfs4_state_owner {
-	spinlock_t	     so_lock;
-	struct list_head     so_list;	 /* per-clientid list of state_owners */
+	struct nfs_unique_id so_owner_id;
 	struct nfs_client    *so_client;
-	u32                  so_id;      /* 32-bit identifier, unique */
-	atomic_t	     so_count;
+	struct rb_node	     so_client_node;
 
 	struct rpc_cred	     *so_cred;	 /* Associated cred */
+
+	spinlock_t	     so_lock;
+	atomic_t	     so_count;
 	struct list_head     so_states;
 	struct list_head     so_delegations;
 	struct nfs_seqid_counter so_seqid;
@@ -108,7 +114,7 @@ struct nfs4_lock_state {
 #define NFS_LOCK_INITIALIZED 1
 	int			ls_flags;
 	struct nfs_seqid_counter	ls_seqid;
-	u32			ls_id;
+	struct nfs_unique_id	ls_id;
 	nfs4_stateid		ls_stateid;
 	atomic_t		ls_count;
 };
@@ -189,7 +195,6 @@ extern void nfs4_renew_state(struct work_struct *);
 
 /* nfs4state.c */
 struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
-extern u32 nfs4_alloc_lockowner_id(struct nfs_client *);
 
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 84d0b7e0dd67..1840ebc78fd3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -253,7 +253,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.fh = NFS_FH(dir);
 	p->o_arg.open_flags = flags,
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
-	p->o_arg.id = sp->so_id;
+	p->o_arg.id = sp->so_owner_id.id;
 	p->o_arg.name = &p->path.dentry->d_name;
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
@@ -651,7 +651,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
 		return;
 	/* Update sequence id. */
-	data->o_arg.id = sp->so_id;
+	data->o_arg.id = sp->so_owner_id.id;
 	data->o_arg.clientid = sp->so_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
 		msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
@@ -3029,7 +3029,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	if (status != 0)
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
-	arg.lock_owner.id = lsp->ls_id; 
+	arg.lock_owner.id = lsp->ls_id.id;
 	status = rpc_call_sync(server->client, &msg, 0);
 	switch (status) {
 		case 0:
@@ -3243,7 +3243,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 		goto out_free;
 	p->arg.lock_stateid = &lsp->ls_stateid;
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
-	p->arg.lock_owner.id = lsp->ls_id;
+	p->arg.lock_owner.id = lsp->ls_id.id;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
 	p->ctx = get_nfs_open_context(ctx);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0f79d56e97f0..ab0b5ab60e60 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -44,6 +44,7 @@
 #include <linux/nfs_idmap.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
+#include <linux/random.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 
@@ -69,18 +70,14 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 	return status;
 }
 
-u32
-nfs4_alloc_lockowner_id(struct nfs_client *clp)
-{
-	return clp->cl_lockowner_id ++;
-}
-
 struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
+	struct rb_node *pos;
 	struct rpc_cred *cred = NULL;
 
-	list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
+	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
 		if (list_empty(&sp->so_states))
 			continue;
 		cred = get_rpccred(sp->so_cred);
@@ -92,32 +89,129 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
+	struct rb_node *pos;
 
-	if (!list_empty(&clp->cl_state_owners)) {
-		sp = list_entry(clp->cl_state_owners.next,
-				struct nfs4_state_owner, so_list);
+	pos = rb_first(&clp->cl_state_owners);
+	if (pos != NULL) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
 		return get_rpccred(sp->so_cred);
 	}
 	return NULL;
 }
 
+static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new,
+		__u64 minval, int maxbits)
+{
+	struct rb_node **p, *parent;
+	struct nfs_unique_id *pos;
+	__u64 mask = ~0ULL;
+
+	if (maxbits < 64)
+		mask = (1ULL << maxbits) - 1ULL;
+
+	/* Ensure distribution is more or less flat */
+	get_random_bytes(&new->id, sizeof(new->id));
+	new->id &= mask;
+	if (new->id < minval)
+		new->id += minval;
+retry:
+	p = &root->rb_node;
+	parent = NULL;
+
+	while (*p != NULL) {
+		parent = *p;
+		pos = rb_entry(parent, struct nfs_unique_id, rb_node);
+
+		if (new->id < pos->id)
+			p = &(*p)->rb_left;
+		else if (new->id > pos->id)
+			p = &(*p)->rb_right;
+		else
+			goto id_exists;
+	}
+	rb_link_node(&new->rb_node, parent, p);
+	rb_insert_color(&new->rb_node, root);
+	return;
+id_exists:
+	for (;;) {
+		new->id++;
+		if (new->id < minval || (new->id & mask) != new->id) {
+			new->id = minval;
+			break;
+		}
+		parent = rb_next(parent);
+		if (parent == NULL)
+			break;
+		pos = rb_entry(parent, struct nfs_unique_id, rb_node);
+		if (new->id < pos->id)
+			break;
+	}
+	goto retry;
+}
+
+static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
+{
+	rb_erase(&id->rb_node, root);
+}
+
 static struct nfs4_state_owner *
 nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred)
 {
+	struct rb_node **p = &clp->cl_state_owners.rb_node,
+		       *parent = NULL;
 	struct nfs4_state_owner *sp, *res = NULL;
 
-	list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
-		if (sp->so_cred != cred)
-			continue;
-		atomic_inc(&sp->so_count);
-		/* Move to the head of the list */
-		list_move(&sp->so_list, &clp->cl_state_owners);
-		res = sp;
-		break;
+	while (*p != NULL) {
+		parent = *p;
+		sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+
+		if (cred < sp->so_cred)
+			p = &parent->rb_left;
+		else if (cred > sp->so_cred)
+			p = &parent->rb_right;
+		else {
+			atomic_inc(&sp->so_count);
+			res = sp;
+			break;
+		}
 	}
 	return res;
 }
 
+static struct nfs4_state_owner *
+nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new)
+{
+	struct rb_node **p = &clp->cl_state_owners.rb_node,
+		       *parent = NULL;
+	struct nfs4_state_owner *sp;
+
+	while (*p != NULL) {
+		parent = *p;
+		sp = rb_entry(parent, struct nfs4_state_owner, so_client_node);
+
+		if (new->so_cred < sp->so_cred)
+			p = &parent->rb_left;
+		else if (new->so_cred > sp->so_cred)
+			p = &parent->rb_right;
+		else {
+			atomic_inc(&sp->so_count);
+			return sp;
+		}
+	}
+	nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64);
+	rb_link_node(&new->so_client_node, parent, p);
+	rb_insert_color(&new->so_client_node, &clp->cl_state_owners);
+	return new;
+}
+
+static void
+nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp)
+{
+	if (!RB_EMPTY_NODE(&sp->so_client_node))
+		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
+	nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id);
+}
+
 /*
  * nfs4_alloc_state_owner(): this is called on the OPEN or CREATE path to
  * create a new state_owner.
@@ -145,10 +239,14 @@ nfs4_alloc_state_owner(void)
 void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs_client *clp = sp->so_client;
-	spin_lock(&clp->cl_lock);
-	list_del_init(&sp->so_list);
-	spin_unlock(&clp->cl_lock);
+	if (!RB_EMPTY_NODE(&sp->so_client_node)) {
+		struct nfs_client *clp = sp->so_client;
+
+		spin_lock(&clp->cl_lock);
+		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
+		RB_CLEAR_NODE(&sp->so_client_node);
+		spin_unlock(&clp->cl_lock);
+	}
 }
 
 /*
@@ -160,22 +258,24 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state_owner *sp, *new;
 
-	new = nfs4_alloc_state_owner();
 	spin_lock(&clp->cl_lock);
 	sp = nfs4_find_state_owner(clp, cred);
-	if (sp == NULL && new != NULL) {
-		list_add(&new->so_list, &clp->cl_state_owners);
-		new->so_client = clp;
-		new->so_id = nfs4_alloc_lockowner_id(clp);
-		new->so_cred = get_rpccred(cred);
-		sp = new;
-		new = NULL;
-	}
 	spin_unlock(&clp->cl_lock);
-	kfree(new);
 	if (sp != NULL)
 		return sp;
-	return NULL;
+	new = nfs4_alloc_state_owner();
+	if (new == NULL)
+		return NULL;
+	new->so_client = clp;
+	new->so_cred = cred;
+	spin_lock(&clp->cl_lock);
+	sp = nfs4_insert_state_owner(clp, new);
+	spin_unlock(&clp->cl_lock);
+	if (sp == new)
+		get_rpccred(cred);
+	else
+		kfree(new);
+	return sp;
 }
 
 /*
@@ -189,7 +289,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
 		return;
-	list_del(&sp->so_list);
+	nfs4_remove_state_owner(clp, sp);
 	spin_unlock(&clp->cl_lock);
 	put_rpccred(cred);
 	kfree(sp);
@@ -386,12 +486,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_owner = fl_owner;
 	spin_lock(&clp->cl_lock);
-	lsp->ls_id = nfs4_alloc_lockowner_id(clp);
+	nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
 	spin_unlock(&clp->cl_lock);
 	INIT_LIST_HEAD(&lsp->ls_locks);
 	return lsp;
 }
 
+static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
+{
+	struct nfs_client *clp = lsp->ls_state->owner->so_client;
+
+	spin_lock(&clp->cl_lock);
+	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
+	spin_unlock(&clp->cl_lock);
+	kfree(lsp);
+}
+
 /*
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
@@ -421,7 +531,8 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
 			return NULL;
 	}
 	spin_unlock(&state->state_lock);
-	kfree(new);
+	if (new != NULL)
+		nfs4_free_lock_state(new);
 	return lsp;
 }
 
@@ -442,7 +553,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
-	kfree(lsp);
+	nfs4_free_lock_state(lsp);
 }
 
 static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -719,11 +830,13 @@ out_err:
 static void nfs4_state_mark_reclaim(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
+	struct rb_node *pos;
 	struct nfs4_state *state;
 	struct nfs4_lock_state *lock;
 
 	/* Reset all sequence ids to zero */
-	list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
+	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
 		sp->so_seqid.counter = 0;
 		sp->so_seqid.flags = 0;
 		spin_lock(&sp->so_lock);
@@ -742,6 +855,7 @@ static int reclaimer(void *ptr)
 {
 	struct nfs_client *clp = ptr;
 	struct nfs4_state_owner *sp;
+	struct rb_node *pos;
 	struct nfs4_state_recovery_ops *ops;
 	struct rpc_cred *cred;
 	int status = 0;
@@ -787,7 +901,8 @@ restart_loop:
 	/* Mark all delegations for reclaim */
 	nfs_delegation_mark_reclaim(clp);
 	/* Note: list is protected by exclusive lock on cl->cl_sem */
-	list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
+	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
 		status = nfs4_reclaim_open_state(ops, sp);
 		if (status < 0) {
 			if (status == -NFS4ERR_NO_GRACE) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4c8f67d47523..c08738441f73 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -68,10 +68,10 @@ static int nfs4_stat_to_errno(int);
 #endif
 
 /* lock,open owner id: 
- * we currently use size 1 (u32) out of (NFS4_OPAQUE_LIMIT  >> 2)
+ * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT  >> 2)
  */
-#define open_owner_id_maxsz	(1 + 1)
-#define lock_owner_id_maxsz	(1 + 1)
+#define open_owner_id_maxsz	(1 + 4)
+#define lock_owner_id_maxsz	(1 + 4)
 #define compound_encode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
 #define compound_decode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
 #define op_encode_hdr_maxsz	(1)
@@ -827,13 +827,14 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
 	WRITE64(nfs4_lock_length(args->fl));
 	WRITE32(args->new_lock_owner);
 	if (args->new_lock_owner){
-		RESERVE_SPACE(4+NFS4_STATEID_SIZE+20);
+		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
 		WRITE32(args->open_seqid->sequence->counter);
 		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
 		WRITE32(args->lock_seqid->sequence->counter);
 		WRITE64(args->lock_owner.clientid);
-		WRITE32(4);
-		WRITE32(args->lock_owner.id);
+		WRITE32(16);
+		WRITEMEM("lock id:", 8);
+		WRITE64(args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
@@ -848,14 +849,15 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(40);
+	RESERVE_SPACE(52);
 	WRITE32(OP_LOCKT);
 	WRITE32(nfs4_lock_type(args->fl, 0));
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
 	WRITE64(args->lock_owner.clientid);
-	WRITE32(4);
-	WRITE32(args->lock_owner.id);
+	WRITE32(16);
+	WRITEMEM("lock id:", 8);
+	WRITE64(args->lock_owner.id);
 
 	return 0;
 }
@@ -920,10 +922,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	WRITE32(OP_OPEN);
 	WRITE32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->open_flags);
-	RESERVE_SPACE(16);
+	RESERVE_SPACE(28);
 	WRITE64(arg->clientid);
-	WRITE32(4);
-	WRITE32(arg->id);
+	WRITE32(16);
+	WRITEMEM("open id:", 8);
+	WRITE64(arg->id);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2cef0a68aa77..0cac49bc0955 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -34,7 +34,8 @@ struct nfs_client {
 	nfs4_verifier		cl_confirm;
 	unsigned long		cl_state;
 
-	u32			cl_lockowner_id;
+	struct rb_root		cl_openowner_id;
+	struct rb_root		cl_lockowner_id;
 
 	/*
 	 * The following rwsem ensures exclusive access to the server
@@ -43,7 +44,7 @@ struct nfs_client {
 	struct rw_semaphore	cl_sem;
 
 	struct list_head	cl_delegations;
-	struct list_head	cl_state_owners;
+	struct rb_root		cl_state_owners;
 	spinlock_t		cl_lock;
 
 	unsigned long		cl_lease_time;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f7100df3a690..38d77681cf27 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -119,7 +119,7 @@ struct nfs_openargs {
 	struct nfs_seqid *	seqid;
 	int			open_flags;
 	__u64                   clientid;
-	__u32                   id;
+	__u64                   id;
 	union {
 		struct iattr *  attrs;    /* UNCHECKED, GUARDED */
 		nfs4_verifier   verifier; /* EXCLUSIVE */
@@ -181,7 +181,7 @@ struct nfs_closeres {
  *   */
 struct nfs_lowner {
 	__u64			clientid;
-	u32			id;
+	__u64			id;
 };
 
 struct nfs_lock_args {
-- 
cgit v1.3-8-gc7d7


From 412c77cee6d6e73fbe1dc3d67f52163efed33fc4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jul 2007 16:10:55 -0400
Subject: NFSv4: Defer inode revalidation when setting up a delegation

Currently we force a synchronous call to __nfs_revalidate_inode() in
nfs_inode_set_delegation(). This not only ensures that we cannot call
nfs_inode_set_delegation from an asynchronous context, but it also slows
down any call to open().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c    | 10 ++++++----
 fs/nfs/inode.c         |  4 +++-
 include/linux/nfs_fs.h |  1 +
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 56f4f6a99d4e..20ac403469a0 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -124,10 +124,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	struct nfs_delegation *delegation;
 	int status = 0;
 
-	/* Ensure we first revalidate the attributes and page cache! */
-	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR)))
-		__nfs_revalidate_inode(NFS_SERVER(inode), inode);
-
 	delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
 	if (delegation == NULL)
 		return -ENOMEM;
@@ -154,6 +150,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 			status = -EIO;
 		}
 	}
+
+	/* Ensure we revalidate the attributes and page cache! */
+	spin_lock(&inode->i_lock);
+	nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
+	spin_unlock(&inode->i_lock);
+
 	spin_unlock(&clp->cl_lock);
 	kfree(delegation);
 	return status;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9d5124166d20..3d9fccf4ef93 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1072,8 +1072,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		invalid &= ~NFS_INO_INVALID_DATA;
 	if (data_stable)
 		invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
-	if (!nfs_have_delegation(inode, FMODE_READ))
+	if (!nfs_have_delegation(inode, FMODE_READ) ||
+			(nfsi->cache_validity & NFS_INO_REVAL_FORCED))
 		nfsi->cache_validity |= invalid;
+	nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
 
 	return 0;
  out_changed:
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index cf395351cdd4..e94971040de9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -184,6 +184,7 @@ struct nfs_inode {
 #define NFS_INO_INVALID_ACCESS	0x0008		/* cached access cred invalid */
 #define NFS_INO_INVALID_ACL	0x0010		/* cached acls are invalid */
 #define NFS_INO_REVAL_PAGECACHE	0x0020		/* must revalidate pagecache */
+#define NFS_INO_REVAL_FORCED	0x0040		/* force revalidation ignoring a delegation */
 
 /*
  * Bit offsets in flags field
-- 
cgit v1.3-8-gc7d7


From 433c92379d9c2c59c2ebc7628fe4fb02cfc2daf8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 1 Jul 2007 12:12:14 -0400
Subject: NFS: Clean up nfs_size_to_loff_t()

Use the same file size limit that lockd uses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index e94971040de9..7deb5b0347f7 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -503,12 +503,10 @@ extern int  nfsroot_mount(struct sockaddr_in *, char *, struct nfs_fh *,
  * inline functions
  */
 
-static inline loff_t
-nfs_size_to_loff_t(__u64 size)
+static inline loff_t nfs_size_to_loff_t(__u64 size)
 {
-	loff_t maxsz = (((loff_t) ULONG_MAX) << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE - 1;
-	if (size > maxsz)
-		return maxsz;
+	if (size > (__u64) OFFSET_MAX - 1)
+		return OFFSET_MAX - 1;
 	return (loff_t) size;
 }
 
-- 
cgit v1.3-8-gc7d7


From 5680d48be88d12cd987e5579a6072a4ca34ca6ea Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 1 Jul 2007 12:12:24 -0400
Subject: NFS: Clean-up: Define macros for maximum host and export path name
 lengths

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c            | 4 ++--
 include/linux/nfs_mount.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 14c7923697d2..e7d197085834 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -867,12 +867,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 		}
 	}
 
-	p = nfs_copy_user_string(NULL, &data->hostname, 256);
+	p = nfs_copy_user_string(NULL, &data->hostname, NFS4_MAXNAMLEN);
 	if (IS_ERR(p))
 		goto out_err;
 	hostname = p;
 
-	p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
+	p = nfs_copy_user_string(NULL, &data->mnt_path, NFS4_MAXPATHLEN);
 	if (IS_ERR(p))
 		goto out_err;
 	mntpath = p;
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index cc8b9c59acb8..0b82a17c705b 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -37,7 +37,7 @@ struct nfs_mount_data {
 	int		acdirmin;		/* 1 */
 	int		acdirmax;		/* 1 */
 	struct sockaddr_in addr;		/* 1 */
-	char		hostname[256];		/* 1 */
+	char		hostname[NFS_MAXNAMLEN + 1];		/* 1 */
 	int		namlen;			/* 2 */
 	unsigned int	bsize;			/* 3 */
 	struct nfs3_fh	root;			/* 4 */
-- 
cgit v1.3-8-gc7d7


From f18289931d705f9c4634b361341a1677bea97aca Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 1 Jul 2007 12:12:51 -0400
Subject: NFS: Add a new NFS debugging flag just for mount processing

Note to self: fix up /usr/sbin/rpcdebug too

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 7deb5b0347f7..04f659f1e560 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -553,6 +553,7 @@ extern void * nfs_root_data(void);
 #define NFSDBG_ROOT		0x0080
 #define NFSDBG_CALLBACK		0x0100
 #define NFSDBG_CLIENT		0x0200
+#define NFSDBG_MOUNT		0x0400
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
-- 
cgit v1.3-8-gc7d7


From cce63cd6374e6f1b4ea897ece1454feb13993d7c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 1 Jul 2007 12:13:12 -0400
Subject: SUNRPC: Rename rpcb_getport_external routine

In preparation for handling NFS mount option parsing in the kernel,
rename rpcb_getport_external as rpcb_get_port_sync, and make it available
always (instead of only when CONFIG_ROOT_NFS is enabled).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c            |  2 +-
 include/linux/sunrpc/clnt.h |  7 ++-----
 net/sunrpc/rpcb_clnt.c      | 21 +++++++++++----------
 3 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 49d1008ce1d7..f0db4703b1c9 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -428,7 +428,7 @@ static int __init root_nfs_getport(int program, int version, int proto)
 	printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n",
 		program, version, NIPQUAD(servaddr));
 	set_sockaddr(&sin, servaddr, 0);
-	return rpcb_getport_external(&sin, program, version, proto);
+	return rpcb_getport_sync(&sin, program, version, proto);
 }
 
 
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 097984b03857..b28d919c7758 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -120,8 +120,10 @@ struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
+
 int		rpcb_register(u32, u32, int, unsigned short, int *);
 void		rpcb_getport(struct rpc_task *);
+int		rpcb_getport_sync(struct sockaddr_in *, __u32, __u32, int);
 
 void		rpc_call_setup(struct rpc_task *, struct rpc_message *, int);
 
@@ -141,10 +143,5 @@ void		rpc_force_rebind(struct rpc_clnt *);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
 char *		rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
 
-/*
- * Helper function for NFSroot support
- */
-int		rpcb_getport_external(struct sockaddr_in *, __u32, __u32, int);
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 9a20f380ab09..fc881a675eb9 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -12,6 +12,8 @@
  *  Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/module.h>
+
 #include <linux/types.h>
 #include <linux/socket.h>
 #include <linux/kernel.h>
@@ -247,21 +249,20 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 	return error;
 }
 
-#ifdef CONFIG_ROOT_NFS
 /**
- * rpcb_getport_external - obtain the port for an RPC service on a given host
+ * rpcb_getport_sync - obtain the port for an RPC service on a given host
  * @sin: address of remote peer
  * @prog: RPC program number to bind
  * @vers: RPC version number to bind
  * @prot: transport protocol to use to make this request
  *
  * Called from outside the RPC client in a synchronous task context.
+ * Uses default timeout parameters specified by underlying transport.
  *
- * For now, this supports only version 2 queries, but is used only by
- * mount_clnt for NFS_ROOT.
+ * XXX: Needs to support IPv6, and rpcbind versions 3 and 4
  */
-int rpcb_getport_external(struct sockaddr_in *sin, __u32 prog,
-				__u32 vers, int prot)
+int rpcb_getport_sync(struct sockaddr_in *sin, __u32 prog,
+		      __u32 vers, int prot)
 {
 	struct rpcbind_args map = {
 		.r_prog		= prog,
@@ -278,10 +279,10 @@ int rpcb_getport_external(struct sockaddr_in *sin, __u32 prog,
 	char hostname[40];
 	int status;
 
-	dprintk("RPC:       rpcb_getport_external(%u.%u.%u.%u, %u, %u, %d)\n",
-			NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot);
+	dprintk("RPC:       %s(" NIPQUAD_FMT ", %u, %u, %d)\n",
+		__FUNCTION__, NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot);
 
-	sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr));
+	sprintf(hostname, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
 	rpcb_clnt = rpcb_create(hostname, (struct sockaddr *)sin, prot, 2, 0);
 	if (IS_ERR(rpcb_clnt))
 		return PTR_ERR(rpcb_clnt);
@@ -296,7 +297,7 @@ int rpcb_getport_external(struct sockaddr_in *sin, __u32 prog,
 	}
 	return status;
 }
-#endif
+EXPORT_SYMBOL_GPL(rpcb_getport_sync);
 
 /**
  * rpcb_getport - obtain the port for a given RPC service on a given host
-- 
cgit v1.3-8-gc7d7


From 45160d6275814e0c86206e6981f0b92c61a50a21 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 1 Jul 2007 12:13:17 -0400
Subject: SUNRPC: Rename rpcb_getport to be consistent with new
 rpcb_getport_sync name

Clean up, for consistency.  Rename rpcb_getport as rpcb_getport_async, to
match the naming scheme of rpcb_getport_sync.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  2 +-
 net/sunrpc/rpcb_clnt.c      | 37 +++++++++++++++++++------------------
 net/sunrpc/xprtsock.c       |  4 ++--
 3 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index b28d919c7758..c1b37972b0d5 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -122,8 +122,8 @@ void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
 
 int		rpcb_register(u32, u32, int, unsigned short, int *);
-void		rpcb_getport(struct rpc_task *);
 int		rpcb_getport_sync(struct sockaddr_in *, __u32, __u32, int);
+void		rpcb_getport_async(struct rpc_task *);
 
 void		rpc_call_setup(struct rpc_task *, struct rpc_message *, int);
 
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index fc881a675eb9..d1740dbab991 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -300,13 +300,13 @@ int rpcb_getport_sync(struct sockaddr_in *sin, __u32 prog,
 EXPORT_SYMBOL_GPL(rpcb_getport_sync);
 
 /**
- * rpcb_getport - obtain the port for a given RPC service on a given host
+ * rpcb_getport_async - obtain the port for a given RPC service on a given host
  * @task: task that is waiting for portmapper request
  *
  * This one can be called for an ongoing RPC request, and can be used in
  * an async (rpciod) context.
  */
-void rpcb_getport(struct rpc_task *task)
+void rpcb_getport_async(struct rpc_task *task)
 {
 	struct rpc_clnt *clnt = task->tk_client;
 	int bind_version;
@@ -317,17 +317,17 @@ void rpcb_getport(struct rpc_task *task)
 	struct sockaddr addr;
 	int status;
 
-	dprintk("RPC: %5u rpcb_getport(%s, %u, %u, %d)\n",
-			task->tk_pid, clnt->cl_server,
-			clnt->cl_prog, clnt->cl_vers, xprt->prot);
+	dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
+		task->tk_pid, __FUNCTION__,
+		clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot);
 
 	/* Autobind on cloned rpc clients is discouraged */
 	BUG_ON(clnt->cl_parent != clnt);
 
 	if (xprt_test_and_set_binding(xprt)) {
 		status = -EACCES;		/* tell caller to check again */
-		dprintk("RPC: %5u rpcb_getport waiting for another binder\n",
-				task->tk_pid);
+		dprintk("RPC: %5u %s: waiting for another binder\n",
+			task->tk_pid, __FUNCTION__);
 		goto bailout_nowake;
 	}
 
@@ -338,27 +338,28 @@ void rpcb_getport(struct rpc_task *task)
 	/* Someone else may have bound if we slept */
 	if (xprt_bound(xprt)) {
 		status = 0;
-		dprintk("RPC: %5u rpcb_getport already bound\n", task->tk_pid);
+		dprintk("RPC: %5u %s: already bound\n",
+			task->tk_pid, __FUNCTION__);
 		goto bailout_nofree;
 	}
 
 	if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) {
 		xprt->bind_index = 0;
 		status = -EACCES;	/* tell caller to try again later */
-		dprintk("RPC: %5u rpcb_getport no more getport versions "
-				"available\n", task->tk_pid);
+		dprintk("RPC: %5u %s: no more getport versions available\n",
+			task->tk_pid, __FUNCTION__);
 		goto bailout_nofree;
 	}
 	bind_version = rpcb_next_version[xprt->bind_index].rpc_vers;
 
-	dprintk("RPC: %5u rpcb_getport trying rpcbind version %u\n",
-			task->tk_pid, bind_version);
+	dprintk("RPC: %5u %s: trying rpcbind version %u\n",
+		task->tk_pid, __FUNCTION__, bind_version);
 
 	map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
 	if (!map) {
 		status = -ENOMEM;
-		dprintk("RPC: %5u rpcb_getport no memory available\n",
-				task->tk_pid);
+		dprintk("RPC: %5u %s: no memory available\n",
+			task->tk_pid, __FUNCTION__);
 		goto bailout_nofree;
 	}
 	map->r_prog = clnt->cl_prog;
@@ -376,8 +377,8 @@ void rpcb_getport(struct rpc_task *task)
 	rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0);
 	if (IS_ERR(rpcb_clnt)) {
 		status = PTR_ERR(rpcb_clnt);
-		dprintk("RPC: %5u rpcb_getport rpcb_create failed, error %ld\n",
-				task->tk_pid, PTR_ERR(rpcb_clnt));
+		dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
+			task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt));
 		goto bailout;
 	}
 
@@ -385,8 +386,8 @@ void rpcb_getport(struct rpc_task *task)
 	rpc_release_client(rpcb_clnt);
 	if (IS_ERR(child)) {
 		status = -EIO;
-		dprintk("RPC: %5u rpcb_getport rpc_run_task failed\n",
-				task->tk_pid);
+		dprintk("RPC: %5u %s: rpc_run_task failed\n",
+			task->tk_pid, __FUNCTION__);
 		goto bailout_nofree;
 	}
 	rpc_put_task(child);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ee6ad3baf680..a8f7c5fd752c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1473,7 +1473,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
 	.release_xprt		= xprt_release_xprt_cong,
-	.rpcbind		= rpcb_getport,
+	.rpcbind		= rpcb_getport_async,
 	.set_port		= xs_set_port,
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
@@ -1490,7 +1490,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 static struct rpc_xprt_ops xs_tcp_ops = {
 	.reserve_xprt		= xprt_reserve_xprt,
 	.release_xprt		= xs_tcp_release_xprt,
-	.rpcbind		= rpcb_getport,
+	.rpcbind		= rpcb_getport_async,
 	.set_port		= xs_set_port,
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
-- 
cgit v1.3-8-gc7d7


From 3ea97309e6b18bce200211b3f9188e8023321adc Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 1 Jul 2007 12:13:27 -0400
Subject: NFS: Remake nfsroot_mount as a permanent part of NFS client

In preparation for supporting NFSv2 and NFSv3 mount option handling in the
kernel NFS client, convert mount_clnt.c to be a permanent part of the NFS
client, instead of built only when CONFIG_ROOT_NFS is enabled.

In addition, we also replace the "struct sockaddr_in *" argument with
something more generic, to help support IPv6 at some later point.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile        |  4 ++--
 fs/nfs/mount_clnt.c    | 55 +++++++++++++++++++++++++-------------------------
 fs/nfs/nfsroot.c       |  3 ++-
 include/linux/nfs_fs.h |  5 ++---
 4 files changed, 34 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index f4580b44eef4..b55cb236cf74 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
 			   pagelist.o proc.o read.o symlink.o unlink.o \
-			   write.o namespace.o
-nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
+			   write.o namespace.o mount_clnt.o
+nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
 nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 2892ec843066..ee4899d96629 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -18,7 +18,7 @@
 #include <linux/nfs_fs.h>
 
 #ifdef RPC_DEBUG
-# define NFSDBG_FACILITY	NFSDBG_ROOT
+# define NFSDBG_FACILITY	NFSDBG_MOUNT
 #endif
 
 /*
@@ -28,7 +28,6 @@
 #define MOUNT_UMNT		3
  */
 
-static struct rpc_clnt *	mnt_create(struct sockaddr_in *, int, int);
 static struct rpc_program	mnt_program;
 
 struct mnt_fhstatus {
@@ -36,14 +35,21 @@ struct mnt_fhstatus {
 	struct nfs_fh *		fh;
 };
 
-/*
- * Obtain an NFS file handle for the given host and path
+/**
+ * nfs_mount - Obtain an NFS file handle for the given host and path
+ * @addr: pointer to server's address
+ * @len: size of server's address
+ * @hostname: name of server host, or NULL
+ * @path: pointer to string containing export path to mount
+ * @version: mount version to use for this request
+ * @protocol: transport protocol to use for thie request
+ * @fh: pointer to location to place returned file handle
+ *
+ * Uses default timeout parameters specified by underlying transport.
  */
-int
-nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
-		int version, int protocol)
+int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path,
+	      int version, int protocol, struct nfs_fh *fh)
 {
-	struct rpc_clnt		*mnt_clnt;
 	struct mnt_fhstatus	result = {
 		.fh		= fh
 	};
@@ -51,12 +57,23 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
 		.rpc_argp	= path,
 		.rpc_resp	= &result,
 	};
+	struct rpc_create_args args = {
+		.protocol	= protocol,
+		.address	= addr,
+		.addrsize	= len,
+		.servername	= hostname,
+		.program	= &mnt_program,
+		.version	= version,
+		.authflavor	= RPC_AUTH_UNIX,
+		.flags		= RPC_CLNT_CREATE_INTR,
+	};
+	struct rpc_clnt		*mnt_clnt;
 	int			status;
 
-	dprintk("NFS:      nfs_mount(%08x:%s)\n",
-			(unsigned)ntohl(addr->sin_addr.s_addr), path);
+	dprintk("NFS: sending MNT request for %s:%s\n",
+		(hostname ? hostname : "server"), path);
 
-	mnt_clnt = mnt_create(addr, version, protocol);
+	mnt_clnt = rpc_create(&args);
 	if (IS_ERR(mnt_clnt))
 		return PTR_ERR(mnt_clnt);
 
@@ -70,22 +87,6 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
 	return status < 0? status : (result.status? -EACCES : 0);
 }
 
-static struct rpc_clnt *mnt_create(struct sockaddr_in *srvaddr, int version,
-				   int protocol)
-{
-	struct rpc_create_args args = {
-		.protocol	= protocol,
-		.address	= (struct sockaddr *)srvaddr,
-		.addrsize	= sizeof(*srvaddr),
-		.program	= &mnt_program,
-		.version	= version,
-		.authflavor	= RPC_AUTH_UNIX,
-		.flags		= RPC_CLNT_CREATE_INTR,
-	};
-
-	return rpc_create(&args);
-}
-
 /*
  * XDR encode/decode functions for MOUNT
  */
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index f0db4703b1c9..3490322d1145 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -496,7 +496,8 @@ static int __init root_nfs_get_handle(void)
 					NFS_MNT3_VERSION : NFS_MNT_VERSION;
 
 	set_sockaddr(&sin, servaddr, htons(mount_port));
-	status = nfsroot_mount(&sin, nfs_path, &fh, version, protocol);
+	status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL,
+			   nfs_path, version, protocol, &fh);
 	if (status < 0)
 		printk(KERN_ERR "Root-NFS: Server returned error %d "
 				"while mounting %s\n", status, nfs_path);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 04f659f1e560..c098ae194f79 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -494,10 +494,9 @@ static inline void nfs3_forget_cached_acls(struct inode *inode)
 
 /*
  * linux/fs/mount_clnt.c
- * (Used only by nfsroot module)
  */
-extern int  nfsroot_mount(struct sockaddr_in *, char *, struct nfs_fh *,
-		int, int);
+extern int  nfs_mount(struct sockaddr *, size_t, char *, char *,
+		      int, int, struct nfs_fh *);
 
 /*
  * inline functions
-- 
cgit v1.3-8-gc7d7


From 8007122520f0a3599bdc4df47358a5d83b2574aa Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 1 Jul 2007 12:13:59 -0400
Subject: NFS: Add support for mounting NFSv4 file systems with string options

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c             | 91 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/nfs4_mount.h |  2 +-
 2 files changed, 85 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 757aa3b7e64b..064e69d2fdde 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1541,8 +1541,90 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options,
 		*ip_addr = c;
 
 		break;
-	default:
-		goto out_bad_version;
+	default: {
+		unsigned int len;
+		struct nfs_parsed_mount_data args = {
+			.rsize		= NFS_MAX_FILE_IO_SIZE,
+			.wsize		= NFS_MAX_FILE_IO_SIZE,
+			.timeo		= 600,
+			.retrans	= 2,
+			.acregmin	= 3,
+			.acregmax	= 60,
+			.acdirmin	= 30,
+			.acdirmax	= 60,
+			.nfs_server.protocol = IPPROTO_TCP,
+		};
+
+		if (nfs_parse_mount_options((char *) *options, &args) == 0)
+			return -EINVAL;
+
+		if (!nfs_verify_server_address((struct sockaddr *)
+						&args.nfs_server.address))
+			return -EINVAL;
+		*addr = args.nfs_server.address;
+
+		switch (args.auth_flavor_len) {
+		case 0:
+			*authflavour = RPC_AUTH_UNIX;
+			break;
+		case 1:
+			*authflavour = (rpc_authflavor_t) args.auth_flavors[0];
+			break;
+		default:
+			goto out_inval_auth;
+		}
+
+		/*
+		 * Translate to nfs4_mount_data, which nfs4_fill_super
+		 * can deal with.
+		 */
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+		if (data == NULL)
+			return -ENOMEM;
+		*options = data;
+
+		data->version	= 1;
+		data->flags	= args.flags & NFS4_MOUNT_FLAGMASK;
+		data->rsize	= args.rsize;
+		data->wsize	= args.wsize;
+		data->timeo	= args.timeo;
+		data->retrans	= args.retrans;
+		data->acregmin	= args.acregmin;
+		data->acregmax	= args.acregmax;
+		data->acdirmin	= args.acdirmin;
+		data->acdirmax	= args.acdirmax;
+		data->proto	= args.nfs_server.protocol;
+
+		/*
+		 * Split "dev_name" into "hostname:mntpath".
+		 */
+		c = strchr(dev_name, ':');
+		if (c == NULL)
+			return -EINVAL;
+		/* while calculating len, pretend ':' is '\0' */
+		len = c - dev_name;
+		if (len > NFS4_MAXNAMLEN)
+			return -EINVAL;
+		*hostname = kzalloc(len, GFP_KERNEL);
+		if (*hostname == NULL)
+			return -ENOMEM;
+		strncpy(*hostname, dev_name, len - 1);
+
+		c++;			/* step over the ':' */
+		len = strlen(c);
+		if (len > NFS4_MAXPATHLEN)
+			return -EINVAL;
+		*mntpath = kzalloc(len + 1, GFP_KERNEL);
+		if (*mntpath == NULL)
+			return -ENOMEM;
+		strncpy(*mntpath, c, len);
+
+		dprintk("MNTPATH: %s\n", *mntpath);
+
+		*ip_addr = args.client_address;
+
+		break;
+		}
 	}
 
 	return 0;
@@ -1559,11 +1641,6 @@ out_inval_auth:
 out_no_address:
 	dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
 	return -EINVAL;
-
-out_bad_version:
-	dfprintk(MOUNT, "NFS4: bad nfs_mount_data version %d\n",
-		 data->version);
-	return -EINVAL;
 }
 
 /*
diff --git a/include/linux/nfs4_mount.h b/include/linux/nfs4_mount.h
index 26b4c83f831d..d8d7480e5a47 100644
--- a/include/linux/nfs4_mount.h
+++ b/include/linux/nfs4_mount.h
@@ -65,6 +65,6 @@ struct nfs4_mount_data {
 #define NFS4_MOUNT_NOCTO	0x0010	/* 1 */
 #define NFS4_MOUNT_NOAC		0x0020	/* 1 */
 #define NFS4_MOUNT_STRICTLOCK	0x1000	/* 1 */
-#define NFS4_MOUNT_FLAGMASK	0xFFFF
+#define NFS4_MOUNT_FLAGMASK	0x1033
 
 #endif
-- 
cgit v1.3-8-gc7d7


From 75180df2ed467866ada839fe73cf7cc7d75c0a22 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 16 May 2007 16:53:28 -0400
Subject: NFS: Add the mount option "nosharecache"

Prior to David Howell's mount changes in 2.6.18, users who mounted
different directories which happened to be from the same filesystem on the
server would get different super blocks, and hence could choose different
mount options. As long as there were no hard linked files that crossed from
one subtree to another, this was quite safe.
Post the changes, if the two directories are on the same filesystem (have
the same 'fsid'), they will share the same super block, and hence the same
mount options.

Add a flag to allow users to elect not to share the NFS super block with
another mount point, even if the fsids are the same. This will allow
users to set different mount options for the two different super blocks, as
was previously possible. It is still up to the user to ensure that there
are no cache coherency issues when doing this, however the default
behaviour will be to share super blocks whenever two paths result in
the same fsid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c             | 43 ++++++++++++++++++++++++++++++++++++++-----
 include/linux/nfs4_mount.h |  3 ++-
 include/linux/nfs_mount.h  |  1 +
 3 files changed, 41 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 064e69d2fdde..1b555cd41e3b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -100,6 +100,7 @@ enum {
 	Opt_udp, Opt_tcp,
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
+	Opt_sharecache, Opt_nosharecache,
 
 	/* Mount options that take integer arguments */
 	Opt_port,
@@ -146,6 +147,8 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_noacl, "noacl" },
 	{ Opt_rdirplus, "rdirplus" },
 	{ Opt_nordirplus, "nordirplus" },
+	{ Opt_sharecache, "sharecache" },
+	{ Opt_nosharecache, "nosharecache" },
 
 	{ Opt_port, "port=%u" },
 	{ Opt_rsize, "rsize=%u" },
@@ -450,6 +453,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
 		{ NFS_MOUNT_NOACL, ",noacl", "" },
 		{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
+		{ NFS_MOUNT_UNSHARED, ",nosharecache", ""},
 		{ 0, NULL, NULL }
 	};
 	const struct proc_nfs_info *nfs_infop;
@@ -714,6 +718,12 @@ static int nfs_parse_mount_options(char *raw,
 		case Opt_nordirplus:
 			mnt->flags |= NFS_MOUNT_NORDIRPLUS;
 			break;
+		case Opt_sharecache:
+			mnt->flags &= ~NFS_MOUNT_UNSHARED;
+			break;
+		case Opt_nosharecache:
+			mnt->flags |= NFS_MOUNT_UNSHARED;
+			break;
 
 		case Opt_port:
 			if (match_int(args, &option))
@@ -1309,6 +1319,9 @@ static int nfs_compare_super(struct super_block *sb, void *data)
 
 	if (old->nfs_client != server->nfs_client)
 		return 0;
+	/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
+	if (old->flags & NFS_MOUNT_UNSHARED)
+		return 0;
 	if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
 		return 0;
 	return 1;
@@ -1322,6 +1335,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	struct nfs_fh mntfh;
 	struct nfs_mount_data *data = raw_data;
 	struct dentry *mntroot;
+	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	int error;
 
 	/* Validate the mount data */
@@ -1336,8 +1350,11 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 		goto out;
 	}
 
+	if (server->flags & NFS_MOUNT_UNSHARED)
+		compare_super = NULL;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
+	s = sget(fs_type, compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
@@ -1402,6 +1419,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
+	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	int error;
 
 	dprintk("--> nfs_xdev_get_sb()\n");
@@ -1413,8 +1431,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		goto out_err_noserver;
 	}
 
+	if (server->flags & NFS_MOUNT_UNSHARED)
+		compare_super = NULL;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
@@ -1657,6 +1678,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	struct nfs_fh mntfh;
 	struct dentry *mntroot;
 	char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL;
+	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	int error;
 
 	/* Validate the mount data */
@@ -1673,8 +1695,11 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 		goto out;
 	}
 
+	if (server->flags & NFS4_MOUNT_UNSHARED)
+		compare_super = NULL;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
+	s = sget(fs_type, compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_free;
@@ -1740,6 +1765,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
+	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	int error;
 
 	dprintk("--> nfs4_xdev_get_sb()\n");
@@ -1751,8 +1777,11 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		goto out_err_noserver;
 	}
 
+	if (server->flags & NFS4_MOUNT_UNSHARED)
+		compare_super = NULL;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
@@ -1807,6 +1836,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 	struct nfs_server *server;
 	struct dentry *mntroot;
 	struct nfs_fh mntfh;
+	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	int error;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
@@ -1818,8 +1848,11 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 		goto out_err_noserver;
 	}
 
+	if (server->flags & NFS4_MOUNT_UNSHARED)
+		compare_super = NULL;
+
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	s = sget(&nfs_fs_type, compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
diff --git a/include/linux/nfs4_mount.h b/include/linux/nfs4_mount.h
index d8d7480e5a47..a0dcf6655657 100644
--- a/include/linux/nfs4_mount.h
+++ b/include/linux/nfs4_mount.h
@@ -65,6 +65,7 @@ struct nfs4_mount_data {
 #define NFS4_MOUNT_NOCTO	0x0010	/* 1 */
 #define NFS4_MOUNT_NOAC		0x0020	/* 1 */
 #define NFS4_MOUNT_STRICTLOCK	0x1000	/* 1 */
-#define NFS4_MOUNT_FLAGMASK	0x1033
+#define NFS4_MOUNT_UNSHARED	0x8000	/* 1 */
+#define NFS4_MOUNT_FLAGMASK	0x9033
 
 #endif
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index 0b82a17c705b..a3ade89a64d2 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -62,6 +62,7 @@ struct nfs_mount_data {
 #define NFS_MOUNT_STRICTLOCK	0x1000	/* reserved for NFSv4 */
 #define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 */
 #define NFS_MOUNT_NORDIRPLUS	0x4000	/* 5 */
+#define NFS_MOUNT_UNSHARED	0x8000	/* 5 */
 #define NFS_MOUNT_FLAGMASK	0xFFFF
 
 #endif
-- 
cgit v1.3-8-gc7d7


From 96802a095171f5b35cf0e1e0d4be943e6696a253 Mon Sep 17 00:00:00 2001
From: Frank van Maarseveen <frankvm@frankvm.com>
Date: Sun, 8 Jul 2007 13:08:54 +0200
Subject: SUNRPC: cleanup transport creation argument passing

Cleanup argument passing to functions for creating an RPC transport.

Signed-off-by: Frank van Maarseveen <frankvm@frankvm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 15 +++++++++++----
 net/sunrpc/clnt.c           |  9 +++++++--
 net/sunrpc/xprt.c           | 15 ++++++---------
 net/sunrpc/xprtsock.c       | 36 ++++++++++++++++--------------------
 4 files changed, 40 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 34f7590506fa..ea828b09e4ad 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -17,6 +17,8 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/msg_prot.h>
 
+#ifdef __KERNEL__
+
 extern unsigned int xprt_udp_slot_table_entries;
 extern unsigned int xprt_tcp_slot_table_entries;
 
@@ -194,7 +196,12 @@ struct rpc_xprt {
 	char *			address_strings[RPC_DISPLAY_MAX];
 };
 
-#ifdef __KERNEL__
+struct rpc_xprtsock_create {
+	int			proto;		/* IPPROTO_UDP or IPPROTO_TCP */
+	struct sockaddr *	dstaddr;	/* remote peer address */
+	size_t			addrlen;
+	struct rpc_timeout *	timeout;	/* optional timeout parameters */
+};
 
 /*
  * Transport operations used by ULPs
@@ -204,7 +211,7 @@ void			xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long
 /*
  * Generic internal transport functions
  */
-struct rpc_xprt *	xprt_create_transport(int proto, struct sockaddr *addr, size_t size, struct rpc_timeout *toparms);
+struct rpc_xprt *	xprt_create_transport(struct rpc_xprtsock_create *args);
 void			xprt_connect(struct rpc_task *task);
 void			xprt_reserve(struct rpc_task *task);
 int			xprt_reserve_xprt(struct rpc_task *task);
@@ -242,8 +249,8 @@ void			xprt_disconnect(struct rpc_xprt *xprt);
 /*
  * Socket transport setup operations
  */
-struct rpc_xprt *	xs_setup_udp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to);
-struct rpc_xprt *	xs_setup_tcp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to);
+struct rpc_xprt *	xs_setup_udp(struct rpc_xprtsock_create *args);
+struct rpc_xprt *	xs_setup_tcp(struct rpc_xprtsock_create *args);
 int			init_socket_xprt(void);
 void			cleanup_socket_xprt(void);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0d9b5275fac3..25bbf2d7f603 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -234,10 +234,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 {
 	struct rpc_xprt *xprt;
 	struct rpc_clnt *clnt;
+	struct rpc_xprtsock_create xprtargs = {
+		.proto = args->protocol,
+		.dstaddr = args->address,
+		.addrlen = args->addrsize,
+		.timeout = args->timeout
+	};
 	char servername[20];
 
-	xprt = xprt_create_transport(args->protocol, args->address,
-					args->addrsize, args->timeout);
+	xprt = xprt_create_transport(&xprtargs);
 	if (IS_ERR(xprt))
 		return (struct rpc_clnt *)xprt;
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 518acb74a5bb..c8c2edccad7e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -886,27 +886,24 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long i
 
 /**
  * xprt_create_transport - create an RPC transport
- * @proto: requested transport protocol
- * @ap: remote peer address
- * @size: length of address
- * @to: timeout parameters
+ * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xprt_create_transport(int proto, struct sockaddr *ap, size_t size, struct rpc_timeout *to)
+struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args)
 {
 	struct rpc_xprt	*xprt;
 	struct rpc_rqst	*req;
 
-	switch (proto) {
+	switch (args->proto) {
 	case IPPROTO_UDP:
-		xprt = xs_setup_udp(ap, size, to);
+		xprt = xs_setup_udp(args);
 		break;
 	case IPPROTO_TCP:
-		xprt = xs_setup_tcp(ap, size, to);
+		xprt = xs_setup_tcp(args);
 		break;
 	default:
 		printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
-				proto);
+				args->proto);
 		return ERR_PTR(-EIO);
 	}
 	if (IS_ERR(xprt)) {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a8f7c5fd752c..6b7cea57651c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1502,12 +1502,12 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 };
 
-static struct rpc_xprt *xs_setup_xprt(struct sockaddr *addr, size_t addrlen, unsigned int slot_table_size)
+static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned int slot_table_size)
 {
 	struct rpc_xprt *xprt;
 	struct sock_xprt *new;
 
-	if (addrlen > sizeof(xprt->addr)) {
+	if (args->addrlen > sizeof(xprt->addr)) {
 		dprintk("RPC:       xs_setup_xprt: address too large\n");
 		return ERR_PTR(-EBADF);
 	}
@@ -1529,8 +1529,8 @@ static struct rpc_xprt *xs_setup_xprt(struct sockaddr *addr, size_t addrlen, uns
 		return ERR_PTR(-ENOMEM);
 	}
 
-	memcpy(&xprt->addr, addr, addrlen);
-	xprt->addrlen = addrlen;
+	memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+	xprt->addrlen = args->addrlen;
 	new->port = xs_get_random_port();
 
 	return xprt;
@@ -1538,22 +1538,20 @@ static struct rpc_xprt *xs_setup_xprt(struct sockaddr *addr, size_t addrlen, uns
 
 /**
  * xs_setup_udp - Set up transport to use a UDP socket
- * @addr: address of remote server
- * @addrlen: length of address in bytes
- * @to:   timeout parameters
+ * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xs_setup_udp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to)
+struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args)
 {
 	struct rpc_xprt *xprt;
 	struct sock_xprt *transport;
 
-	xprt = xs_setup_xprt(addr, addrlen, xprt_udp_slot_table_entries);
+	xprt = xs_setup_xprt(args, xprt_udp_slot_table_entries);
 	if (IS_ERR(xprt))
 		return xprt;
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
-	if (ntohs(((struct sockaddr_in *)addr)->sin_port) != 0)
+	if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
 		xprt_set_bound(xprt);
 
 	xprt->prot = IPPROTO_UDP;
@@ -1569,8 +1567,8 @@ struct rpc_xprt *xs_setup_udp(struct sockaddr *addr, size_t addrlen, struct rpc_
 
 	xprt->ops = &xs_udp_ops;
 
-	if (to)
-		xprt->timeout = *to;
+	if (args->timeout)
+		xprt->timeout = *args->timeout;
 	else
 		xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
 
@@ -1583,22 +1581,20 @@ struct rpc_xprt *xs_setup_udp(struct sockaddr *addr, size_t addrlen, struct rpc_
 
 /**
  * xs_setup_tcp - Set up transport to use a TCP socket
- * @addr: address of remote server
- * @addrlen: length of address in bytes
- * @to: timeout parameters
+ * @args: rpc transport creation arguments
  *
  */
-struct rpc_xprt *xs_setup_tcp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to)
+struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args)
 {
 	struct rpc_xprt *xprt;
 	struct sock_xprt *transport;
 
-	xprt = xs_setup_xprt(addr, addrlen, xprt_tcp_slot_table_entries);
+	xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries);
 	if (IS_ERR(xprt))
 		return xprt;
 	transport = container_of(xprt, struct sock_xprt, xprt);
 
-	if (ntohs(((struct sockaddr_in *)addr)->sin_port) != 0)
+	if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0)
 		xprt_set_bound(xprt);
 
 	xprt->prot = IPPROTO_TCP;
@@ -1613,8 +1609,8 @@ struct rpc_xprt *xs_setup_tcp(struct sockaddr *addr, size_t addrlen, struct rpc_
 
 	xprt->ops = &xs_tcp_ops;
 
-	if (to)
-		xprt->timeout = *to;
+	if (args->timeout)
+		xprt->timeout = *args->timeout;
 	else
 		xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
 
-- 
cgit v1.3-8-gc7d7


From a97476926ec061f90b77da478620ea6dc71a3237 Mon Sep 17 00:00:00 2001
From: Frank van Maarseveen <frankvm@frankvm.com>
Date: Mon, 9 Jul 2007 22:21:39 +0200
Subject: SUNRPC server: record the destination address of a request

Save the destination address of an incoming request over TCP like is
done already for UDP. It is necessary later for callbacks by the server.

Signed-off-by: Frank van Maarseveen <frankvm@frankvm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/svcsock.h |  1 +
 net/sunrpc/svcsock.c           | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index e21dd93ac4b7..a53e0fa855d2 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -59,6 +59,7 @@ struct svc_sock {
 	/* cache of various info for TCP sockets */
 	void			*sk_info_authunix;
 
+	struct sockaddr_storage	sk_local;	/* local address */
 	struct sockaddr_storage	sk_remote;	/* remote peer's address */
 	int			sk_remotelen;	/* length of address */
 };
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5baf48de2558..64b9b8c743c4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -644,6 +644,7 @@ svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen)
 	struct msghdr msg = {
 		.msg_flags	= MSG_DONTWAIT,
 	};
+	struct sockaddr *sin;
 	int len;
 
 	len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
@@ -654,6 +655,19 @@ svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen)
 	memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen);
 	rqstp->rq_addrlen = svsk->sk_remotelen;
 
+	/* Destination address in request is needed for binding the
+	 * source address in RPC callbacks later.
+	 */
+	sin = (struct sockaddr *)&svsk->sk_local;
+	switch (sin->sa_family) {
+	case AF_INET:
+		rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
+		break;
+	case AF_INET6:
+		rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
+		break;
+	}
+
 	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
 		svsk, iov[0].iov_base, iov[0].iov_len, len);
 
@@ -1064,6 +1078,12 @@ svc_tcp_accept(struct svc_sock *svsk)
 		goto failed;
 	memcpy(&newsvsk->sk_remote, sin, slen);
 	newsvsk->sk_remotelen = slen;
+	err = kernel_getsockname(newsock, sin, &slen);
+	if (unlikely(err < 0)) {
+		dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
+		slen = offsetof(struct sockaddr, sa_data);
+	}
+	memcpy(&newsvsk->sk_local, sin, slen);
 
 	svc_sock_received(newsvsk);
 
-- 
cgit v1.3-8-gc7d7


From d3bc9a1deb8964d774af8535814cb91bf8f6def0 Mon Sep 17 00:00:00 2001
From: Frank van Maarseveen <frankvm@frankvm.com>
Date: Mon, 9 Jul 2007 22:23:35 +0200
Subject: SUNRPC client: add interface for binding to a local address

In addition to binding to a local privileged port the NFS client should
allow binding to a specific local address. This is used by the server
for callbacks. The patch adds the necessary interface.

Signed-off-by: Frank van Maarseveen <frankvm@frankvm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/clnt.c           |  1 +
 net/sunrpc/xprtsock.c       | 24 ++++++++++++++++--------
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index c1b37972b0d5..c0d9d14983b3 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -98,6 +98,7 @@ struct rpc_create_args {
 	int			protocol;
 	struct sockaddr		*address;
 	size_t			addrsize;
+	struct sockaddr		*saddress;
 	struct rpc_timeout	*timeout;
 	char			*servername;
 	struct rpc_program	*program;
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index ea828b09e4ad..d11cedd14f0f 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -198,6 +198,7 @@ struct rpc_xprt {
 
 struct rpc_xprtsock_create {
 	int			proto;		/* IPPROTO_UDP or IPPROTO_TCP */
+	struct sockaddr *	srcaddr;	/* optional local address */
 	struct sockaddr *	dstaddr;	/* remote peer address */
 	size_t			addrlen;
 	struct rpc_timeout *	timeout;	/* optional timeout parameters */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 25bbf2d7f603..5d3fe7b22488 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -236,6 +236,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	struct rpc_clnt *clnt;
 	struct rpc_xprtsock_create xprtargs = {
 		.proto = args->protocol,
+		.srcaddr = args->saddress,
 		.dstaddr = args->address,
 		.addrlen = args->addrsize,
 		.timeout = args->timeout
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 6b7cea57651c..4ae7eed7f617 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -235,6 +235,7 @@ struct sock_xprt {
 	 * Connection of transports
 	 */
 	struct delayed_work	connect_worker;
+	struct sockaddr_storage	addr;
 	unsigned short		port;
 
 	/*
@@ -1145,31 +1146,36 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 	sap->sin_port = htons(port);
 }
 
-static int xs_bindresvport(struct sock_xprt *transport, struct socket *sock)
+static int xs_bind(struct sock_xprt *transport, struct socket *sock)
 {
 	struct sockaddr_in myaddr = {
 		.sin_family = AF_INET,
 	};
+	struct sockaddr_in *sa;
 	int err;
 	unsigned short port = transport->port;
 
+	if (!transport->xprt.resvport)
+		port = 0;
+	sa = (struct sockaddr_in *)&transport->addr;
+	myaddr.sin_addr = sa->sin_addr;
 	do {
 		myaddr.sin_port = htons(port);
 		err = kernel_bind(sock, (struct sockaddr *) &myaddr,
 						sizeof(myaddr));
+		if (!transport->xprt.resvport)
+			break;
 		if (err == 0) {
 			transport->port = port;
-			dprintk("RPC:       xs_bindresvport bound to port %u\n",
-					port);
-			return 0;
+			break;
 		}
 		if (port <= xprt_min_resvport)
 			port = xprt_max_resvport;
 		else
 			port--;
 	} while (err == -EADDRINUSE && port != transport->port);
-
-	dprintk("RPC:       can't bind to reserved port (%d).\n", -err);
+	dprintk("RPC:       xs_bind "NIPQUAD_FMT":%u: %s (%d)\n",
+		NIPQUAD(myaddr.sin_addr), port, err ? "failed" : "ok", err);
 	return err;
 }
 
@@ -1228,7 +1234,7 @@ static void xs_udp_connect_worker(struct work_struct *work)
 	}
 	xs_reclassify_socket(sock);
 
-	if (xprt->resvport && xs_bindresvport(transport, sock) < 0) {
+	if (xs_bind(transport, sock)) {
 		sock_release(sock);
 		goto out;
 	}
@@ -1315,7 +1321,7 @@ static void xs_tcp_connect_worker(struct work_struct *work)
 		}
 		xs_reclassify_socket(sock);
 
-		if (xprt->resvport && xs_bindresvport(transport, sock) < 0) {
+		if (xs_bind(transport, sock)) {
 			sock_release(sock);
 			goto out;
 		}
@@ -1531,6 +1537,8 @@ static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned
 
 	memcpy(&xprt->addr, args->dstaddr, args->addrlen);
 	xprt->addrlen = args->addrlen;
+	if (args->srcaddr)
+		memcpy(&new->addr, args->srcaddr, args->addrlen);
 	new->port = xs_get_random_port();
 
 	return xprt;
-- 
cgit v1.3-8-gc7d7


From c98451bdb2f3e6d6cc1e03adad641e9497512b49 Mon Sep 17 00:00:00 2001
From: Frank van Maarseveen <frankvm@frankvm.com>
Date: Mon, 9 Jul 2007 22:25:29 +0200
Subject: NLM: fix source address of callback to client

Use the destination address of the original NLM request as the
source address in callbacks to the client.

Signed-off-by: Frank van Maarseveen <frankvm@frankvm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             | 27 +++++++++++++++++++--------
 include/linux/lockd/lockd.h |  1 +
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index c252a1c95857..572601e98dcd 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -44,9 +44,8 @@ static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
  */
 static struct nlm_host *
 nlm_lookup_host(int server, const struct sockaddr_in *sin,
-					int proto, int version,
-					const char *hostname,
-					int hostname_len)
+		int proto, int version, const char *hostname,
+		int hostname_len, const struct sockaddr_in *ssin)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos;
@@ -54,7 +53,9 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin,
 	struct nsm_handle *nsm = NULL;
 	int		hash;
 
-	dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n",
+	dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
+			", p=%d, v=%d, my role=%s, name=%.*s)\n",
+			NIPQUAD(ssin->sin_addr.s_addr),
 			NIPQUAD(sin->sin_addr.s_addr), proto, version,
 			server? "server" : "client",
 			hostname_len,
@@ -91,6 +92,8 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin,
 			continue;
 		if (host->h_server != server)
 			continue;
+		if (!nlm_cmp_addr(&host->h_saddr, ssin))
+			continue;
 
 		/* Move to head of hash chain. */
 		hlist_del(&host->h_hash);
@@ -118,6 +121,7 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin,
 	host->h_name	   = nsm->sm_name;
 	host->h_addr       = *sin;
 	host->h_addr.sin_port = 0;	/* ouch! */
+	host->h_saddr	   = *ssin;
 	host->h_version    = version;
 	host->h_proto      = proto;
 	host->h_rpcclnt    = NULL;
@@ -174,8 +178,10 @@ struct nlm_host *
 nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
 			const char *hostname, int hostname_len)
 {
+	struct sockaddr_in ssin = {0};
+
 	return nlm_lookup_host(0, sin, proto, version,
-			       hostname, hostname_len);
+			       hostname, hostname_len, &ssin);
 }
 
 /*
@@ -185,9 +191,12 @@ struct nlm_host *
 nlmsvc_lookup_host(struct svc_rqst *rqstp,
 			const char *hostname, int hostname_len)
 {
+	struct sockaddr_in ssin = {0};
+
+	ssin.sin_addr = rqstp->rq_daddr.addr;
 	return nlm_lookup_host(1, svc_addr_in(rqstp),
 			       rqstp->rq_prot, rqstp->rq_vers,
-			       hostname, hostname_len);
+			       hostname, hostname_len, &ssin);
 }
 
 /*
@@ -198,8 +207,9 @@ nlm_bind_host(struct nlm_host *host)
 {
 	struct rpc_clnt	*clnt;
 
-	dprintk("lockd: nlm_bind_host(%08x)\n",
-			(unsigned)ntohl(host->h_addr.sin_addr.s_addr));
+	dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
+			NIPQUAD(host->h_saddr.sin_addr),
+			NIPQUAD(host->h_addr.sin_addr));
 
 	/* Lock host handle */
 	mutex_lock(&host->h_mutex);
@@ -226,6 +236,7 @@ nlm_bind_host(struct nlm_host *host)
 			.protocol	= host->h_proto,
 			.address	= (struct sockaddr *)&host->h_addr,
 			.addrsize	= sizeof(host->h_addr),
+			.saddress	= (struct sockaddr *)&host->h_saddr,
 			.timeout	= &timeparms,
 			.servername	= host->h_name,
 			.program	= &nlm_program,
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 05707e2fccae..e2d1ce36b367 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -39,6 +39,7 @@
 struct nlm_host {
 	struct hlist_node	h_hash;		/* doubly linked list */
 	struct sockaddr_in	h_addr;		/* peer address */
+	struct sockaddr_in	h_saddr;	/* our address (optional) */
 	struct rpc_clnt	*	h_rpcclnt;	/* RPC client to talk to peer */
 	char *			h_name;		/* remote hostname */
 	u32			h_version;	/* interface version */
-- 
cgit v1.3-8-gc7d7


From e06e7c615877026544ad7f8b309d1a3706410383 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sun, 10 Jun 2007 17:22:39 -0700
Subject: [IPV4]: The scheduled removal of multipath cached routing support.

With help from Chris Wedgwood.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/feature-removal-schedule.txt |  19 --
 include/linux/Kbuild                       |   1 -
 include/linux/ip_mp_alg.h                  |  22 --
 include/linux/rtnetlink.h                  |   2 +-
 include/net/dst.h                          |   1 -
 include/net/ip_fib.h                       |  16 --
 include/net/ip_mp_alg.h                    |  96 ---------
 include/net/route.h                        |   1 -
 net/ipv4/Kconfig                           |  42 ----
 net/ipv4/Makefile                          |   5 -
 net/ipv4/fib_frontend.c                    |   4 -
 net/ipv4/fib_semantics.c                   |  16 --
 net/ipv4/multipath.c                       |  55 -----
 net/ipv4/multipath_drr.c                   | 249 ----------------------
 net/ipv4/multipath_random.c                | 114 ----------
 net/ipv4/multipath_rr.c                    |  95 ---------
 net/ipv4/multipath_wrandom.c               | 329 -----------------------------
 net/ipv4/route.c                           | 259 +----------------------
 18 files changed, 12 insertions(+), 1314 deletions(-)
 delete mode 100644 include/linux/ip_mp_alg.h
 delete mode 100644 include/net/ip_mp_alg.h
 delete mode 100644 net/ipv4/multipath.c
 delete mode 100644 net/ipv4/multipath_drr.c
 delete mode 100644 net/ipv4/multipath_random.c
 delete mode 100644 net/ipv4/multipath_rr.c
 delete mode 100644 net/ipv4/multipath_wrandom.c

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 3a159dac04f5..484250dcdbe0 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -262,25 +262,6 @@ Who:	Richard Purdie <rpurdie@rpsys.net>
 
 ---------------------------
 
-What:	Multipath cached routing support in ipv4
-When:	in 2.6.23
-Why:	Code was merged, then submitter immediately disappeared leaving
-	us with no maintainer and lots of bugs.  The code should not have
-	been merged in the first place, and many aspects of it's
-	implementation are blocking more critical core networking
-	development.  It's marked EXPERIMENTAL and no distribution
-	enables it because it cause obscure crashes due to unfixable bugs
-	(interfaces don't return errors so memory allocation can't be
-	handled, calling contexts of these interfaces make handling
-	errors impossible too because they get called after we've
-	totally commited to creating a route object, for example).
-	This problem has existed for years and no forward progress
-	has ever been made, and nobody steps up to try and salvage
-	this code, so we're going to finally just get rid of it.
-Who:	David S. Miller <davem@davemloft.net>
-
----------------------------
-
 What:	read_dev_chars(), read_conf_data{,_lpm}() (s390 common I/O layer)
 When:	December 2007
 Why:	These functions are a leftover from 2.4 times. They have several
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index afae306b177c..d94451682761 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -91,7 +91,6 @@ header-y += in6.h
 header-y += in_route.h
 header-y += ioctl.h
 header-y += ipmi_msgdefs.h
-header-y += ip_mp_alg.h
 header-y += ipsec.h
 header-y += ipx.h
 header-y += irda.h
diff --git a/include/linux/ip_mp_alg.h b/include/linux/ip_mp_alg.h
deleted file mode 100644
index e234e2008f5d..000000000000
--- a/include/linux/ip_mp_alg.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* ip_mp_alg.h: IPV4 multipath algorithm support, user-visible values.
- *
- * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
- * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
- */
-
-#ifndef _LINUX_IP_MP_ALG_H
-#define _LINUX_IP_MP_ALG_H
-
-enum ip_mp_alg {
-	IP_MP_ALG_NONE,
-	IP_MP_ALG_RR,
-	IP_MP_ALG_DRR,
-	IP_MP_ALG_RANDOM,
-	IP_MP_ALG_WRANDOM,
-	__IP_MP_ALG_MAX
-};
-
-#define IP_MP_ALG_MAX (__IP_MP_ALG_MAX - 1)
-
-#endif /* _LINUX_IP_MP_ALG_H */
-
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 1fae30af91f3..612785848532 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -261,7 +261,7 @@ enum rtattr_type_t
 	RTA_FLOW,
 	RTA_CACHEINFO,
 	RTA_SESSION,
-	RTA_MP_ALGO,
+	RTA_MP_ALGO, /* no longer used */
 	RTA_TABLE,
 	__RTA_MAX
 };
diff --git a/include/net/dst.h b/include/net/dst.h
index 82270f9332db..e9ff4a4caef9 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -47,7 +47,6 @@ struct dst_entry
 #define DST_NOXFRM		2
 #define DST_NOPOLICY		4
 #define DST_NOHASH		8
-#define DST_BALANCED            0x10
 	unsigned long		expires;
 
 	unsigned short		header_len;	/* more space at head required */
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 69252cbe05b0..8cadc77c7df4 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -39,7 +39,6 @@ struct fib_config {
 	int			fc_mx_len;
 	int			fc_mp_len;
 	u32			fc_flow;
-	u32			fc_mp_alg;
 	u32			fc_nlflags;
 	struct nl_info		fc_nlinfo;
  };
@@ -85,9 +84,6 @@ struct fib_info {
 	int			fib_nhs;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int			fib_power;
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	u32			fib_mp_alg;
 #endif
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].nh_dev
@@ -103,10 +99,6 @@ struct fib_result {
 	unsigned char	nh_sel;
 	unsigned char	type;
 	unsigned char	scope;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	__be32          network;
-	__be32          netmask;
-#endif
 	struct fib_info *fi;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rule	*r;
@@ -145,14 +137,6 @@ struct fib_result_nl {
 #define FIB_RES_DEV(res)		(FIB_RES_NH(res).nh_dev)
 #define FIB_RES_OIF(res)		(FIB_RES_NH(res).nh_oif)
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-#define FIB_RES_NETWORK(res)		((res).network)
-#define FIB_RES_NETMASK(res)	        ((res).netmask)
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
-#define FIB_RES_NETWORK(res)		(0)
-#define FIB_RES_NETMASK(res)	        (0)
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_WRANDOM */
-
 struct fib_table {
 	struct hlist_node tb_hlist;
 	u32		tb_id;
diff --git a/include/net/ip_mp_alg.h b/include/net/ip_mp_alg.h
deleted file mode 100644
index 25b56571e54b..000000000000
--- a/include/net/ip_mp_alg.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* ip_mp_alg.h: IPV4 multipath algorithm support.
- *
- * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
- * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
- */
-
-#ifndef _NET_IP_MP_ALG_H
-#define _NET_IP_MP_ALG_H
-
-#include <linux/ip_mp_alg.h>
-#include <net/flow.h>
-#include <net/route.h>
-
-struct fib_nh;
-
-struct ip_mp_alg_ops {
-	void	(*mp_alg_select_route)(const struct flowi *flp,
-				       struct rtable *rth, struct rtable **rp);
-	void	(*mp_alg_flush)(void);
-	void	(*mp_alg_set_nhinfo)(__be32 network, __be32 netmask,
-				     unsigned char prefixlen,
-				     const struct fib_nh *nh);
-	void	(*mp_alg_remove)(struct rtable *rth);
-};
-
-extern int multipath_alg_register(struct ip_mp_alg_ops *, enum ip_mp_alg);
-extern void multipath_alg_unregister(struct ip_mp_alg_ops *, enum ip_mp_alg);
-
-extern struct ip_mp_alg_ops *ip_mp_alg_table[];
-
-static inline int multipath_select_route(const struct flowi *flp,
-					 struct rtable *rth,
-					 struct rtable **rp)
-{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	struct ip_mp_alg_ops *ops = ip_mp_alg_table[rth->rt_multipath_alg];
-
-	/* mp_alg_select_route _MUST_ be implemented */
-	if (ops && (rth->u.dst.flags & DST_BALANCED)) {
-		ops->mp_alg_select_route(flp, rth, rp);
-		return 1;
-	}
-#endif
-	return 0;
-}
-
-static inline void multipath_flush(void)
-{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	int i;
-
-	for (i = IP_MP_ALG_NONE; i <= IP_MP_ALG_MAX; i++) {
-		struct ip_mp_alg_ops *ops = ip_mp_alg_table[i];
-
-		if (ops && ops->mp_alg_flush)
-			ops->mp_alg_flush();
-	}
-#endif
-}
-
-static inline void multipath_set_nhinfo(struct rtable *rth,
-					__be32 network, __be32 netmask,
-					unsigned char prefixlen,
-					const struct fib_nh *nh)
-{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	struct ip_mp_alg_ops *ops = ip_mp_alg_table[rth->rt_multipath_alg];
-
-	if (ops && ops->mp_alg_set_nhinfo)
-		ops->mp_alg_set_nhinfo(network, netmask, prefixlen, nh);
-#endif
-}
-
-static inline void multipath_remove(struct rtable *rth)
-{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	struct ip_mp_alg_ops *ops = ip_mp_alg_table[rth->rt_multipath_alg];
-
-	if (ops && ops->mp_alg_remove &&
-	    (rth->u.dst.flags & DST_BALANCED))
-		ops->mp_alg_remove(rth);
-#endif
-}
-
-static inline int multipath_comparekeys(const struct flowi *flp1,
-					const struct flowi *flp2)
-{
-	return flp1->fl4_dst == flp2->fl4_dst &&
-		flp1->fl4_src == flp2->fl4_src &&
-		flp1->oif == flp2->oif &&
-		flp1->mark == flp2->mark &&
-		!((flp1->fl4_tos ^ flp2->fl4_tos) &
-		  (IPTOS_RT_MASK | RTO_ONLINK));
-}
-
-#endif /* _NET_IP_MP_ALG_H */
diff --git a/include/net/route.h b/include/net/route.h
index 749e4dfe5ff3..188b8936bb27 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -62,7 +62,6 @@ struct rtable
 	
 	unsigned		rt_flags;
 	__u16			rt_type;
-	__u16			rt_multipath_alg;
 
 	__be32			rt_dst;	/* Path destination	*/
 	__be32			rt_src;	/* Path source		*/
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 010fbb2d45e9..fb7909774254 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -116,48 +116,6 @@ config IP_ROUTE_MULTIPATH
 	  equal "cost" and chooses one of them in a non-deterministic fashion
 	  if a matching packet arrives.
 
-config IP_ROUTE_MULTIPATH_CACHED
-	bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
-	depends on IP_ROUTE_MULTIPATH
-	help
-	  Normally, equal cost multipath routing is not supported by the
-	  routing cache. If you say Y here, alternative routes are cached
-	  and on cache lookup a route is chosen in a configurable fashion.
-
-	  If unsure, say N.
-
-config IP_ROUTE_MULTIPATH_RR
-	tristate "MULTIPATH: round robin algorithm"
-	depends on IP_ROUTE_MULTIPATH_CACHED
-	help
-	  Multipath routes are chosen according to Round Robin
-
-config IP_ROUTE_MULTIPATH_RANDOM
-	tristate "MULTIPATH: random algorithm"
-	depends on IP_ROUTE_MULTIPATH_CACHED
-	help
-	  Multipath routes are chosen in a random fashion. Actually,
-	  there is no weight for a route. The advantage of this policy
-	  is that it is implemented stateless and therefore introduces only
-	  a very small delay.
-
-config IP_ROUTE_MULTIPATH_WRANDOM
-	tristate "MULTIPATH: weighted random algorithm"
-	depends on IP_ROUTE_MULTIPATH_CACHED
-	help
-	  Multipath routes are chosen in a weighted random fashion. 
-	  The per route weights are the weights visible via ip route 2. As the
-	  corresponding state management introduces some overhead routing delay
-	  is increased.
-
-config IP_ROUTE_MULTIPATH_DRR
-	tristate "MULTIPATH: interface round robin algorithm"
-	depends on IP_ROUTE_MULTIPATH_CACHED
-	help
-	  Connections are distributed in a round robin fashion over the
-	  available interfaces. This policy makes sense if the connections 
-	  should be primarily distributed on interfaces and not on routes. 
-
 config IP_ROUTE_VERBOSE
 	bool "IP: verbose route monitoring"
 	depends on IP_ADVANCED_ROUTER
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4ff6c151d7f3..fbf1674e0c2a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,14 +29,9 @@ obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
 obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
 obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
 obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
-obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 311d633f7f39..2eb909be8041 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -453,7 +453,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
 	[RTA_PROTOINFO]		= { .type = NLA_U32 },
 	[RTA_FLOW]		= { .type = NLA_U32 },
-	[RTA_MP_ALGO]		= { .type = NLA_U32 },
 };
 
 static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -515,9 +514,6 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 		case RTA_FLOW:
 			cfg->fc_flow = nla_get_u32(attr);
 			break;
-		case RTA_MP_ALGO:
-			cfg->fc_mp_alg = nla_get_u32(attr);
-			break;
 		case RTA_TABLE:
 			cfg->fc_table = nla_get_u32(attr);
 			break;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index bb94550d95c3..c434119deb52 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -42,7 +42,6 @@
 #include <net/tcp.h>
 #include <net/sock.h>
 #include <net/ip_fib.h>
-#include <net/ip_mp_alg.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
 
@@ -697,13 +696,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			goto err_inval;
 	}
 #endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	if (cfg->fc_mp_alg) {
-		if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
-		    cfg->fc_mp_alg > IP_MP_ALG_MAX)
-			goto err_inval;
-	}
-#endif
 
 	err = -ENOBUFS;
 	if (fib_info_cnt >= fib_hash_size) {
@@ -791,10 +783,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 #endif
 	}
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	fi->fib_mp_alg = cfg->fc_mp_alg;
-#endif
-
 	if (fib_props[cfg->fc_type].error) {
 		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
 			goto err_inval;
@@ -940,10 +928,6 @@ out_fill_res:
 	res->type = fa->fa_type;
 	res->scope = fa->fa_scope;
 	res->fi = fa->fa_info;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	res->netmask = mask;
-	res->network = zone & inet_make_mask(prefixlen);
-#endif
 	atomic_inc(&res->fi->fib_clntref);
 	return 0;
 }
diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c
deleted file mode 100644
index 4e9ca7c76407..000000000000
--- a/net/ipv4/multipath.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/* multipath.c: IPV4 multipath algorithm support.
- *
- * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
- * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/netdevice.h>
-#include <linux/spinlock.h>
-
-#include <net/ip_mp_alg.h>
-
-static DEFINE_SPINLOCK(alg_table_lock);
-struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1];
-
-int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
-{
-	struct ip_mp_alg_ops **slot;
-	int err;
-
-	if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX ||
-	    !ops->mp_alg_select_route)
-		return -EINVAL;
-
-	spin_lock(&alg_table_lock);
-	slot = &ip_mp_alg_table[n];
-	if (*slot != NULL) {
-		err = -EBUSY;
-	} else {
-		*slot = ops;
-		err = 0;
-	}
-	spin_unlock(&alg_table_lock);
-
-	return err;
-}
-EXPORT_SYMBOL(multipath_alg_register);
-
-void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
-{
-	struct ip_mp_alg_ops **slot;
-
-	if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX)
-		return;
-
-	spin_lock(&alg_table_lock);
-	slot = &ip_mp_alg_table[n];
-	if (*slot == ops)
-		*slot = NULL;
-	spin_unlock(&alg_table_lock);
-
-	synchronize_net();
-}
-EXPORT_SYMBOL(multipath_alg_unregister);
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
deleted file mode 100644
index b03c5ca2c823..000000000000
--- a/net/ipv4/multipath_drr.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- *              Device round robin policy for multipath.
- *
- *
- * Version:	$Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $
- *
- * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_mp_alg.h>
-
-struct multipath_device {
-	int		ifi; /* interface index of device */
-	atomic_t	usecount;
-	int 		allocated;
-};
-
-#define MULTIPATH_MAX_DEVICECANDIDATES 10
-
-static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
-static DEFINE_SPINLOCK(state_lock);
-
-static int inline __multipath_findslot(void)
-{
-	int i;
-
-	for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
-		if (state[i].allocated == 0)
-			return i;
-	}
-	return -1;
-}
-
-static int inline __multipath_finddev(int ifindex)
-{
-	int i;
-
-	for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
-		if (state[i].allocated != 0 &&
-		    state[i].ifi == ifindex)
-			return i;
-	}
-	return -1;
-}
-
-static int drr_dev_event(struct notifier_block *this,
-			 unsigned long event, void *ptr)
-{
-	struct net_device *dev = ptr;
-	int devidx;
-
-	switch (event) {
-	case NETDEV_UNREGISTER:
-	case NETDEV_DOWN:
-		spin_lock_bh(&state_lock);
-
-		devidx = __multipath_finddev(dev->ifindex);
-		if (devidx != -1) {
-			state[devidx].allocated = 0;
-			state[devidx].ifi = 0;
-			atomic_set(&state[devidx].usecount, 0);
-		}
-
-		spin_unlock_bh(&state_lock);
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block drr_dev_notifier = {
-	.notifier_call	= drr_dev_event,
-};
-
-
-static void drr_safe_inc(atomic_t *usecount)
-{
-	int n;
-
-	atomic_inc(usecount);
-
-	n = atomic_read(usecount);
-	if (n <= 0) {
-		int i;
-
-		spin_lock_bh(&state_lock);
-
-		for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++)
-			atomic_set(&state[i].usecount, 0);
-
-		spin_unlock_bh(&state_lock);
-	}
-}
-
-static void drr_select_route(const struct flowi *flp,
-			     struct rtable *first, struct rtable **rp)
-{
-	struct rtable *nh, *result, *cur_min;
-	int min_usecount = -1;
-	int devidx = -1;
-	int cur_min_devidx = -1;
-
-	/* 1. make sure all alt. nexthops have the same GC related data */
-	/* 2. determine the new candidate to be returned */
-	result = NULL;
-	cur_min = NULL;
-	for (nh = rcu_dereference(first); nh;
-	     nh = rcu_dereference(nh->u.dst.rt_next)) {
-		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
-		    multipath_comparekeys(&nh->fl, flp)) {
-			int nh_ifidx = nh->u.dst.dev->ifindex;
-
-			nh->u.dst.lastuse = jiffies;
-			nh->u.dst.__use++;
-			if (result != NULL)
-				continue;
-
-			/* search for the output interface */
-
-			/* this is not SMP safe, only add/remove are
-			 * SMP safe as wrong usecount updates have no big
-			 * impact
-			 */
-			devidx = __multipath_finddev(nh_ifidx);
-			if (devidx == -1) {
-				/* add the interface to the array
-				 * SMP safe
-				 */
-				spin_lock_bh(&state_lock);
-
-				/* due to SMP: search again */
-				devidx = __multipath_finddev(nh_ifidx);
-				if (devidx == -1) {
-					/* add entry for device */
-					devidx = __multipath_findslot();
-					if (devidx == -1) {
-						/* unlikely but possible */
-						continue;
-					}
-
-					state[devidx].allocated = 1;
-					state[devidx].ifi = nh_ifidx;
-					atomic_set(&state[devidx].usecount, 0);
-					min_usecount = 0;
-				}
-
-				spin_unlock_bh(&state_lock);
-			}
-
-			if (min_usecount == 0) {
-				/* if the device has not been used it is
-				 * the primary target
-				 */
-				drr_safe_inc(&state[devidx].usecount);
-				result = nh;
-			} else {
-				int count =
-					atomic_read(&state[devidx].usecount);
-
-				if (min_usecount == -1 ||
-				    count < min_usecount) {
-					cur_min = nh;
-					cur_min_devidx = devidx;
-					min_usecount = count;
-				}
-			}
-		}
-	}
-
-	if (!result) {
-		if (cur_min) {
-			drr_safe_inc(&state[cur_min_devidx].usecount);
-			result = cur_min;
-		} else {
-			result = first;
-		}
-	}
-
-	*rp = result;
-}
-
-static struct ip_mp_alg_ops drr_ops = {
-	.mp_alg_select_route	=	drr_select_route,
-};
-
-static int __init drr_init(void)
-{
-	int err = register_netdevice_notifier(&drr_dev_notifier);
-
-	if (err)
-		return err;
-
-	err = multipath_alg_register(&drr_ops, IP_MP_ALG_DRR);
-	if (err)
-		goto fail;
-
-	return 0;
-
-fail:
-	unregister_netdevice_notifier(&drr_dev_notifier);
-	return err;
-}
-
-static void __exit drr_exit(void)
-{
-	unregister_netdevice_notifier(&drr_dev_notifier);
-	multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR);
-}
-
-module_init(drr_init);
-module_exit(drr_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
deleted file mode 100644
index c312785d14d0..000000000000
--- a/net/ipv4/multipath_random.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- *              Random policy for multipath.
- *
- *
- * Version:	$Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $
- *
- * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <linux/random.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_mp_alg.h>
-
-#define MULTIPATH_MAX_CANDIDATES 40
-
-static void random_select_route(const struct flowi *flp,
-				struct rtable *first,
-				struct rtable **rp)
-{
-	struct rtable *rt;
-	struct rtable *decision;
-	unsigned char candidate_count = 0;
-
-	/* count all candidate */
-	for (rt = rcu_dereference(first); rt;
-	     rt = rcu_dereference(rt->u.dst.rt_next)) {
-		if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
-		    multipath_comparekeys(&rt->fl, flp))
-			++candidate_count;
-	}
-
-	/* choose a random candidate */
-	decision = first;
-	if (candidate_count > 1) {
-		unsigned char i = 0;
-		unsigned char candidate_no = (unsigned char)
-			(random32() % candidate_count);
-
-		/* find chosen candidate and adjust GC data for all candidates
-		 * to ensure they stay in cache
-		 */
-		for (rt = first; rt; rt = rt->u.dst.rt_next) {
-			if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
-			    multipath_comparekeys(&rt->fl, flp)) {
-				rt->u.dst.lastuse = jiffies;
-
-				if (i == candidate_no)
-					decision = rt;
-
-				if (i >= candidate_count)
-					break;
-
-				i++;
-			}
-		}
-	}
-
-	decision->u.dst.__use++;
-	*rp = decision;
-}
-
-static struct ip_mp_alg_ops random_ops = {
-	.mp_alg_select_route	=	random_select_route,
-};
-
-static int __init random_init(void)
-{
-	return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM);
-}
-
-static void __exit random_exit(void)
-{
-	multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM);
-}
-
-module_init(random_init);
-module_exit(random_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
deleted file mode 100644
index 0ad22524f450..000000000000
--- a/net/ipv4/multipath_rr.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- *              Round robin policy for multipath.
- *
- *
- * Version:	$Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $
- *
- * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_mp_alg.h>
-
-static void rr_select_route(const struct flowi *flp,
-			    struct rtable *first, struct rtable **rp)
-{
-	struct rtable *nh, *result, *min_use_cand = NULL;
-	int min_use = -1;
-
-	/* 1. make sure all alt. nexthops have the same GC related data
-	 * 2. determine the new candidate to be returned
-	 */
-	result = NULL;
-	for (nh = rcu_dereference(first); nh;
-	     nh = rcu_dereference(nh->u.dst.rt_next)) {
-		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
-		    multipath_comparekeys(&nh->fl, flp)) {
-			nh->u.dst.lastuse = jiffies;
-
-			if (min_use == -1 || nh->u.dst.__use < min_use) {
-				min_use = nh->u.dst.__use;
-				min_use_cand = nh;
-			}
-		}
-	}
-	result = min_use_cand;
-	if (!result)
-		result = first;
-
-	result->u.dst.__use++;
-	*rp = result;
-}
-
-static struct ip_mp_alg_ops rr_ops = {
-	.mp_alg_select_route	=	rr_select_route,
-};
-
-static int __init rr_init(void)
-{
-	return multipath_alg_register(&rr_ops, IP_MP_ALG_RR);
-}
-
-static void __exit rr_exit(void)
-{
-	multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR);
-}
-
-module_init(rr_init);
-module_exit(rr_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
deleted file mode 100644
index 57c503694539..000000000000
--- a/net/ipv4/multipath_wrandom.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- *              Weighted random policy for multipath.
- *
- *
- * Version:	$Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
- *
- * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
- */
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/igmp.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/mroute.h>
-#include <linux/init.h>
-#include <linux/random.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/raw.h>
-#include <linux/notifier.h>
-#include <linux/if_arp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
-#include <net/checksum.h>
-#include <net/ip_fib.h>
-#include <net/ip_mp_alg.h>
-
-#define MULTIPATH_STATE_SIZE 15
-
-struct multipath_candidate {
-	struct multipath_candidate	*next;
-	int				power;
-	struct rtable			*rt;
-};
-
-struct multipath_dest {
-	struct list_head	list;
-
-	const struct fib_nh	*nh_info;
-	__be32			netmask;
-	__be32			network;
-	unsigned char		prefixlen;
-
-	struct rcu_head		rcu;
-};
-
-struct multipath_bucket {
-	struct list_head	head;
-	spinlock_t		lock;
-};
-
-struct multipath_route {
-	struct list_head	list;
-
-	int			oif;
-	__be32			gw;
-	struct list_head	dests;
-
-	struct rcu_head		rcu;
-};
-
-/* state: primarily weight per route information */
-static struct multipath_bucket state[MULTIPATH_STATE_SIZE];
-
-static unsigned char __multipath_lookup_weight(const struct flowi *fl,
-					       const struct rtable *rt)
-{
-	const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
-	struct multipath_route *r;
-	struct multipath_route *target_route = NULL;
-	struct multipath_dest *d;
-	int weight = 1;
-
-	/* lookup the weight information for a certain route */
-	rcu_read_lock();
-
-	/* find state entry for gateway or add one if necessary */
-	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
-		if (r->gw == rt->rt_gateway &&
-		    r->oif == rt->idev->dev->ifindex) {
-			target_route = r;
-			break;
-		}
-	}
-
-	if (!target_route) {
-		/* this should not happen... but we are prepared */
-		printk( KERN_CRIT"%s: missing state for gateway: %u and " \
-			"device %d\n", __FUNCTION__, rt->rt_gateway,
-			rt->idev->dev->ifindex);
-		goto out;
-	}
-
-	/* find state entry for destination */
-	list_for_each_entry_rcu(d, &target_route->dests, list) {
-		__be32 targetnetwork = fl->fl4_dst &
-			inet_make_mask(d->prefixlen);
-
-		if ((targetnetwork & d->netmask) == d->network) {
-			weight = d->nh_info->nh_weight;
-			goto out;
-		}
-	}
-
-out:
-	rcu_read_unlock();
-	return weight;
-}
-
-static void wrandom_init_state(void)
-{
-	int i;
-
-	for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
-		INIT_LIST_HEAD(&state[i].head);
-		spin_lock_init(&state[i].lock);
-	}
-}
-
-static void wrandom_select_route(const struct flowi *flp,
-				 struct rtable *first,
-				 struct rtable **rp)
-{
-	struct rtable *rt;
-	struct rtable *decision;
-	struct multipath_candidate *first_mpc = NULL;
-	struct multipath_candidate *mpc, *last_mpc = NULL;
-	int power = 0;
-	int last_power;
-	int selector;
-	const size_t size_mpc = sizeof(struct multipath_candidate);
-
-	/* collect all candidates and identify their weights */
-	for (rt = rcu_dereference(first); rt;
-	     rt = rcu_dereference(rt->u.dst.rt_next)) {
-		if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
-		    multipath_comparekeys(&rt->fl, flp)) {
-			struct multipath_candidate* mpc =
-				(struct multipath_candidate*)
-				kmalloc(size_mpc, GFP_ATOMIC);
-
-			if (!mpc)
-				return;
-
-			power += __multipath_lookup_weight(flp, rt) * 10000;
-
-			mpc->power = power;
-			mpc->rt = rt;
-			mpc->next = NULL;
-
-			if (!first_mpc)
-				first_mpc = mpc;
-			else
-				last_mpc->next = mpc;
-
-			last_mpc = mpc;
-		}
-	}
-
-	/* choose a weighted random candidate */
-	decision = first;
-	selector = random32() % power;
-	last_power = 0;
-
-	/* select candidate, adjust GC data and cleanup local state */
-	decision = first;
-	last_mpc = NULL;
-	for (mpc = first_mpc; mpc; mpc = mpc->next) {
-		mpc->rt->u.dst.lastuse = jiffies;
-		if (last_power <= selector && selector < mpc->power)
-			decision = mpc->rt;
-
-		last_power = mpc->power;
-		kfree(last_mpc);
-		last_mpc = mpc;
-	}
-
-	/* concurrent __multipath_flush may lead to !last_mpc */
-	kfree(last_mpc);
-
-	decision->u.dst.__use++;
-	*rp = decision;
-}
-
-static void wrandom_set_nhinfo(__be32 network,
-			       __be32 netmask,
-			       unsigned char prefixlen,
-			       const struct fib_nh *nh)
-{
-	const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
-	struct multipath_route *r, *target_route = NULL;
-	struct multipath_dest *d, *target_dest = NULL;
-
-	/* store the weight information for a certain route */
-	spin_lock_bh(&state[state_idx].lock);
-
-	/* find state entry for gateway or add one if necessary */
-	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
-		if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
-			target_route = r;
-			break;
-		}
-	}
-
-	if (!target_route) {
-		const size_t size_rt = sizeof(struct multipath_route);
-		target_route = (struct multipath_route *)
-			kmalloc(size_rt, GFP_ATOMIC);
-
-		target_route->gw = nh->nh_gw;
-		target_route->oif = nh->nh_oif;
-		memset(&target_route->rcu, 0, sizeof(struct rcu_head));
-		INIT_LIST_HEAD(&target_route->dests);
-
-		list_add_rcu(&target_route->list, &state[state_idx].head);
-	}
-
-	/* find state entry for destination or add one if necessary */
-	list_for_each_entry_rcu(d, &target_route->dests, list) {
-		if (d->nh_info == nh) {
-			target_dest = d;
-			break;
-		}
-	}
-
-	if (!target_dest) {
-		const size_t size_dst = sizeof(struct multipath_dest);
-		target_dest = (struct multipath_dest*)
-			kmalloc(size_dst, GFP_ATOMIC);
-
-		target_dest->nh_info = nh;
-		target_dest->network = network;
-		target_dest->netmask = netmask;
-		target_dest->prefixlen = prefixlen;
-		memset(&target_dest->rcu, 0, sizeof(struct rcu_head));
-
-		list_add_rcu(&target_dest->list, &target_route->dests);
-	}
-	/* else: we already stored this info for another destination =>
-	 * we are finished
-	 */
-
-	spin_unlock_bh(&state[state_idx].lock);
-}
-
-static void __multipath_free(struct rcu_head *head)
-{
-	struct multipath_route *rt = container_of(head, struct multipath_route,
-						  rcu);
-	kfree(rt);
-}
-
-static void __multipath_free_dst(struct rcu_head *head)
-{
-	struct multipath_dest *dst = container_of(head,
-						  struct multipath_dest,
-						  rcu);
-	kfree(dst);
-}
-
-static void wrandom_flush(void)
-{
-	int i;
-
-	/* defere delete to all entries */
-	for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
-		struct multipath_route *r;
-
-		spin_lock_bh(&state[i].lock);
-		list_for_each_entry_rcu(r, &state[i].head, list) {
-			struct multipath_dest *d;
-			list_for_each_entry_rcu(d, &r->dests, list) {
-				list_del_rcu(&d->list);
-				call_rcu(&d->rcu,
-					 __multipath_free_dst);
-			}
-			list_del_rcu(&r->list);
-			call_rcu(&r->rcu,
-				 __multipath_free);
-		}
-
-		spin_unlock_bh(&state[i].lock);
-	}
-}
-
-static struct ip_mp_alg_ops wrandom_ops = {
-	.mp_alg_select_route	=	wrandom_select_route,
-	.mp_alg_flush		=	wrandom_flush,
-	.mp_alg_set_nhinfo	=	wrandom_set_nhinfo,
-};
-
-static int __init wrandom_init(void)
-{
-	wrandom_init_state();
-
-	return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
-}
-
-static void __exit wrandom_exit(void)
-{
-	multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
-}
-
-module_init(wrandom_init);
-module_exit(wrandom_exit);
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 29ca63e81ced..85285021518b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -101,7 +101,6 @@
 #include <net/tcp.h>
 #include <net/icmp.h>
 #include <net/xfrm.h>
-#include <net/ip_mp_alg.h>
 #include <net/netevent.h>
 #include <net/rtnetlink.h>
 #ifdef CONFIG_SYSCTL
@@ -495,13 +494,11 @@ static const struct file_operations rt_cpu_seq_fops = {
 
 static __inline__ void rt_free(struct rtable *rt)
 {
-	multipath_remove(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
 static __inline__ void rt_drop(struct rtable *rt)
 {
-	multipath_remove(rt);
 	ip_rt_put(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
@@ -574,52 +571,6 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 		(fl1->iif ^ fl2->iif)) == 0;
 }
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
-						struct rtable *expentry,
-						int *removed_count)
-{
-	int passedexpired = 0;
-	struct rtable **nextstep = NULL;
-	struct rtable **rthp = chain_head;
-	struct rtable *rth;
-
-	if (removed_count)
-		*removed_count = 0;
-
-	while ((rth = *rthp) != NULL) {
-		if (rth == expentry)
-			passedexpired = 1;
-
-		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
-		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
-			if (*rthp == expentry) {
-				*rthp = rth->u.dst.rt_next;
-				continue;
-			} else {
-				*rthp = rth->u.dst.rt_next;
-				rt_free(rth);
-				if (removed_count)
-					++(*removed_count);
-			}
-		} else {
-			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
-			    passedexpired && !nextstep)
-				nextstep = &rth->u.dst.rt_next;
-
-			rthp = &rth->u.dst.rt_next;
-		}
-	}
-
-	rt_free(expentry);
-	if (removed_count)
-		++(*removed_count);
-
-	return nextstep;
-}
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
-
-
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
@@ -658,22 +609,8 @@ static void rt_check_expire(unsigned long dummy)
 			}
 
 			/* Cleanup aged off entries. */
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-			/* remove all related balanced entries if necessary */
-			if (rth->u.dst.flags & DST_BALANCED) {
-				rthp = rt_remove_balanced_route(
-					&rt_hash_table[i].chain,
-					rth, NULL);
-				if (!rthp)
-					break;
-			} else {
-				*rthp = rth->u.dst.rt_next;
-				rt_free(rth);
-			}
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 			*rthp = rth->u.dst.rt_next;
 			rt_free(rth);
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 		}
 		spin_unlock(rt_hash_lock_addr(i));
 
@@ -721,9 +658,6 @@ void rt_cache_flush(int delay)
 	if (delay < 0)
 		delay = ip_rt_min_delay;
 
-	/* flush existing multipath state*/
-	multipath_flush();
-
 	spin_lock_bh(&rt_flush_lock);
 
 	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
@@ -842,30 +776,9 @@ static int rt_garbage_collect(void)
 					rthp = &rth->u.dst.rt_next;
 					continue;
 				}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-				/* remove all related balanced entries
-				 * if necessary
-				 */
-				if (rth->u.dst.flags & DST_BALANCED) {
-					int r;
-
-					rthp = rt_remove_balanced_route(
-						&rt_hash_table[k].chain,
-						rth,
-						&r);
-					goal -= r;
-					if (!rthp)
-						break;
-				} else {
-					*rthp = rth->u.dst.rt_next;
-					rt_free(rth);
-					goal--;
-				}
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 				*rthp = rth->u.dst.rt_next;
 				rt_free(rth);
 				goal--;
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 			}
 			spin_unlock_bh(rt_hash_lock_addr(k));
 			if (goal <= 0)
@@ -939,12 +852,7 @@ restart:
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-		if (!(rth->u.dst.flags & DST_BALANCED) &&
-		    compare_keys(&rth->fl, &rt->fl)) {
-#else
 		if (compare_keys(&rth->fl, &rt->fl)) {
-#endif
 			/* Put it first */
 			*rthp = rth->u.dst.rt_next;
 			/*
@@ -1774,10 +1682,6 @@ static inline int __mkroute_input(struct sk_buff *skb,
 
 	atomic_set(&rth->u.dst.__refcnt, 1);
 	rth->u.dst.flags= DST_HOST;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	if (res->fi->fib_nhs > 1)
-		rth->u.dst.flags |= DST_BALANCED;
-#endif
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
 		rth->u.dst.flags |= DST_NOPOLICY;
 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
@@ -1812,11 +1716,11 @@ static inline int __mkroute_input(struct sk_buff *skb,
 	return err;
 }
 
-static inline int ip_mkroute_input_def(struct sk_buff *skb,
-				       struct fib_result* res,
-				       const struct flowi *fl,
-				       struct in_device *in_dev,
-				       __be32 daddr, __be32 saddr, u32 tos)
+static inline int ip_mkroute_input(struct sk_buff *skb,
+				   struct fib_result* res,
+				   const struct flowi *fl,
+				   struct in_device *in_dev,
+				   __be32 daddr, __be32 saddr, u32 tos)
 {
 	struct rtable* rth = NULL;
 	int err;
@@ -1837,63 +1741,6 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
 }
 
-static inline int ip_mkroute_input(struct sk_buff *skb,
-				   struct fib_result* res,
-				   const struct flowi *fl,
-				   struct in_device *in_dev,
-				   __be32 daddr, __be32 saddr, u32 tos)
-{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	struct rtable* rth = NULL, *rtres;
-	unsigned char hop, hopcount;
-	int err = -EINVAL;
-	unsigned int hash;
-
-	if (res->fi)
-		hopcount = res->fi->fib_nhs;
-	else
-		hopcount = 1;
-
-	/* distinguish between multipath and singlepath */
-	if (hopcount < 2)
-		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
-					    saddr, tos);
-
-	/* add all alternatives to the routing cache */
-	for (hop = 0; hop < hopcount; hop++) {
-		res->nh_sel = hop;
-
-		/* put reference to previous result */
-		if (hop)
-			ip_rt_put(rtres);
-
-		/* create a routing cache entry */
-		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
-				      &rth);
-		if (err)
-			return err;
-
-		/* put it into the cache */
-		hash = rt_hash(daddr, saddr, fl->iif);
-		err = rt_intern_hash(hash, rth, &rtres);
-		if (err)
-			return err;
-
-		/* forward hop information to multipath impl. */
-		multipath_set_nhinfo(rth,
-				     FIB_RES_NETWORK(*res),
-				     FIB_RES_NETMASK(*res),
-				     res->prefixlen,
-				     &FIB_RES_NH(*res));
-	}
-	skb->dst = &rtres->u.dst;
-	return err;
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
-	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
-#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
-}
-
-
 /*
  *	NOTE. We drop all the packets that has local source
  *	addresses, because every properly looped back packet
@@ -2211,13 +2058,6 @@ static inline int __mkroute_output(struct rtable **result,
 
 	atomic_set(&rth->u.dst.__refcnt, 1);
 	rth->u.dst.flags= DST_HOST;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	if (res->fi) {
-		rth->rt_multipath_alg = res->fi->fib_mp_alg;
-		if (res->fi->fib_nhs > 1)
-			rth->u.dst.flags |= DST_BALANCED;
-	}
-#endif
 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
 		rth->u.dst.flags |= DST_NOXFRM;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
@@ -2277,12 +2117,12 @@ static inline int __mkroute_output(struct rtable **result,
 	return err;
 }
 
-static inline int ip_mkroute_output_def(struct rtable **rp,
-					struct fib_result* res,
-					const struct flowi *fl,
-					const struct flowi *oldflp,
-					struct net_device *dev_out,
-					unsigned flags)
+static inline int ip_mkroute_output(struct rtable **rp,
+				    struct fib_result* res,
+				    const struct flowi *fl,
+				    const struct flowi *oldflp,
+				    struct net_device *dev_out,
+				    unsigned flags)
 {
 	struct rtable *rth = NULL;
 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
@@ -2295,68 +2135,6 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
 	return err;
 }
 
-static inline int ip_mkroute_output(struct rtable** rp,
-				    struct fib_result* res,
-				    const struct flowi *fl,
-				    const struct flowi *oldflp,
-				    struct net_device *dev_out,
-				    unsigned flags)
-{
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	unsigned char hop;
-	unsigned hash;
-	int err = -EINVAL;
-	struct rtable *rth = NULL;
-
-	if (res->fi && res->fi->fib_nhs > 1) {
-		unsigned char hopcount = res->fi->fib_nhs;
-
-		for (hop = 0; hop < hopcount; hop++) {
-			struct net_device *dev2nexthop;
-
-			res->nh_sel = hop;
-
-			/* hold a work reference to the output device */
-			dev2nexthop = FIB_RES_DEV(*res);
-			dev_hold(dev2nexthop);
-
-			/* put reference to previous result */
-			if (hop)
-				ip_rt_put(*rp);
-
-			err = __mkroute_output(&rth, res, fl, oldflp,
-					       dev2nexthop, flags);
-
-			if (err != 0)
-				goto cleanup;
-
-			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
-					oldflp->oif);
-			err = rt_intern_hash(hash, rth, rp);
-
-			/* forward hop information to multipath impl. */
-			multipath_set_nhinfo(rth,
-					     FIB_RES_NETWORK(*res),
-					     FIB_RES_NETMASK(*res),
-					     res->prefixlen,
-					     &FIB_RES_NH(*res));
-		cleanup:
-			/* release work reference to output device */
-			dev_put(dev2nexthop);
-
-			if (err != 0)
-				return err;
-		}
-		return err;
-	} else {
-		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
-					     flags);
-	}
-#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
-	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
-#endif
-}
-
 /*
  * Major route resolver routine.
  */
@@ -2570,17 +2348,6 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
-
-			/* check for multipath routes and choose one if
-			 * necessary
-			 */
-			if (multipath_select_route(flp, rth, rp)) {
-				dst_hold(&(*rp)->u.dst);
-				RT_CACHE_STAT_INC(out_hit);
-				rcu_read_unlock_bh();
-				return 0;
-			}
-
 			rth->u.dst.lastuse = jiffies;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
@@ -2728,10 +2495,6 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 #ifdef CONFIG_NET_CLS_ROUTE
 	if (rt->u.dst.tclassid)
 		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
-		NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
 #endif
 	if (rt->fl.iif)
 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
-- 
cgit v1.3-8-gc7d7


From 8c7b7faaa630fef7f68d8728cee1cce398cc9697 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 10 Jul 2007 22:08:12 -0700
Subject: [NET]: Kill eth_copy_and_sum().

It hasn't "summed" anything in over 7 years, and it's
just a straight mempcy ala skb_copy_to_linear_data()
so just get rid of it.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/ppc/8260_io/enet.c             | 4 ++--
 arch/ppc/8260_io/fcc_enet.c         | 4 ++--
 arch/ppc/8xx_io/enet.c              | 4 ++--
 arch/ppc/8xx_io/fec.c               | 2 +-
 drivers/net/3c523.c                 | 2 +-
 drivers/net/7990.c                  | 4 ++--
 drivers/net/8139too.c               | 2 +-
 drivers/net/a2065.c                 | 4 ++--
 drivers/net/ariadne.c               | 2 +-
 drivers/net/arm/ep93xx_eth.c        | 2 +-
 drivers/net/au1000_eth.c            | 4 ++--
 drivers/net/dl2k.c                  | 4 ++--
 drivers/net/eepro100.c              | 2 +-
 drivers/net/epic100.c               | 2 +-
 drivers/net/fealnx.c                | 4 ++--
 drivers/net/fec.c                   | 2 +-
 drivers/net/hamachi.c               | 4 ++--
 drivers/net/ixp2000/ixpdev.c        | 2 +-
 drivers/net/lance.c                 | 4 ++--
 drivers/net/natsemi.c               | 4 ++--
 drivers/net/ni52.c                  | 2 +-
 drivers/net/ni65.c                  | 4 ++--
 drivers/net/pci-skeleton.c          | 2 +-
 drivers/net/pcnet32.c               | 4 ++--
 drivers/net/saa9730.c               | 4 ++--
 drivers/net/sgiseeq.c               | 2 +-
 drivers/net/sis190.c                | 2 +-
 drivers/net/starfire.c              | 2 +-
 drivers/net/sun3_82586.c            | 2 +-
 drivers/net/sun3lance.c             | 5 ++---
 drivers/net/sunbmac.c               | 2 +-
 drivers/net/sundance.c              | 2 +-
 drivers/net/sunlance.c              | 4 ++--
 drivers/net/sunqe.c                 | 4 ++--
 drivers/net/tulip/interrupt.c       | 8 ++++----
 drivers/net/tulip/winbond-840.c     | 2 +-
 drivers/net/tulip/xircom_cb.c       | 2 +-
 drivers/net/tulip/xircom_tulip_cb.c | 4 ++--
 drivers/net/typhoon.c               | 2 +-
 drivers/net/usb/catc.c              | 2 +-
 drivers/net/usb/kaweth.c            | 2 +-
 drivers/net/via-rhine.c             | 4 ++--
 drivers/net/wireless/wl3501_cs.c    | 2 +-
 drivers/net/yellowfin.c             | 2 +-
 include/linux/etherdevice.h         | 6 ------
 45 files changed, 66 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/8260_io/enet.c b/arch/ppc/8260_io/enet.c
index 4c0a7d732f69..615b6583d9b0 100644
--- a/arch/ppc/8260_io/enet.c
+++ b/arch/ppc/8260_io/enet.c
@@ -477,9 +477,9 @@ for (;;) {
 		}
 		else {
 			skb_put(skb,pkt_len-4);	/* Make room */
-			eth_copy_and_sum(skb,
+			skb_copy_to_linear_data(skb,
 				(unsigned char *)__va(bdp->cbd_bufaddr),
-				pkt_len-4, 0);
+				pkt_len-4);
 			skb->protocol=eth_type_trans(skb,dev);
 			netif_rx(skb);
 		}
diff --git a/arch/ppc/8260_io/fcc_enet.c b/arch/ppc/8260_io/fcc_enet.c
index cab395da25da..6f3ed6a72e0b 100644
--- a/arch/ppc/8260_io/fcc_enet.c
+++ b/arch/ppc/8260_io/fcc_enet.c
@@ -734,9 +734,9 @@ for (;;) {
 		}
 		else {
 			skb_put(skb,pkt_len);	/* Make room */
-			eth_copy_and_sum(skb,
+			skb_copy_to_linear_data(skb,
 				(unsigned char *)__va(bdp->cbd_bufaddr),
-				pkt_len, 0);
+				pkt_len);
 			skb->protocol=eth_type_trans(skb,dev);
 			netif_rx(skb);
 		}
diff --git a/arch/ppc/8xx_io/enet.c b/arch/ppc/8xx_io/enet.c
index e58288e14369..703d47eee436 100644
--- a/arch/ppc/8xx_io/enet.c
+++ b/arch/ppc/8xx_io/enet.c
@@ -506,9 +506,9 @@ for (;;) {
 		}
 		else {
 			skb_put(skb,pkt_len-4);	/* Make room */
-			eth_copy_and_sum(skb,
+			skb_copy_to_linear_data(skb,
 				cep->rx_vaddr[bdp - cep->rx_bd_base],
-				pkt_len-4, 0);
+				pkt_len-4);
 			skb->protocol=eth_type_trans(skb,dev);
 			netif_rx(skb);
 		}
diff --git a/arch/ppc/8xx_io/fec.c b/arch/ppc/8xx_io/fec.c
index d38335d2d710..0288279be9aa 100644
--- a/arch/ppc/8xx_io/fec.c
+++ b/arch/ppc/8xx_io/fec.c
@@ -725,7 +725,7 @@ while (!(bdp->cbd_sc & BD_ENET_RX_EMPTY)) {
 		fep->stats.rx_dropped++;
 	} else {
 		skb_put(skb,pkt_len-4);	/* Make room */
-		eth_copy_and_sum(skb, data, pkt_len-4, 0);
+		skb_copy_to_linear_data(skb, data, pkt_len-4);
 		skb->protocol=eth_type_trans(skb,dev);
 		netif_rx(skb);
 	}
diff --git a/drivers/net/3c523.c b/drivers/net/3c523.c
index da1a22c13865..ab18343e58ef 100644
--- a/drivers/net/3c523.c
+++ b/drivers/net/3c523.c
@@ -990,7 +990,7 @@ static void elmc_rcv_int(struct net_device *dev)
 				if (skb != NULL) {
 					skb_reserve(skb, 2);	/* 16 byte alignment */
 					skb_put(skb,totlen);
-					eth_copy_and_sum(skb, (char *) p->base+(unsigned long) rbd->buffer,totlen,0);
+					skb_copy_to_linear_data(skb, (char *) p->base+(unsigned long) rbd->buffer,totlen);
 					skb->protocol = eth_type_trans(skb, dev);
 					netif_rx(skb);
 					dev->last_rx = jiffies;
diff --git a/drivers/net/7990.c b/drivers/net/7990.c
index 0877fc372f4b..e89ace109a5d 100644
--- a/drivers/net/7990.c
+++ b/drivers/net/7990.c
@@ -333,9 +333,9 @@ static int lance_rx (struct net_device *dev)
 
                         skb_reserve (skb, 2);           /* 16 byte align */
                         skb_put (skb, len);             /* make room */
-                        eth_copy_and_sum(skb,
+                        skb_copy_to_linear_data(skb,
                                          (unsigned char *)&(ib->rx_buf [lp->rx_new][0]),
-                                         len, 0);
+                                         len);
                         skb->protocol = eth_type_trans (skb, dev);
 			netif_rx (skb);
 			dev->last_rx = jiffies;
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index a844b1fe2dc4..21a6ccbf92e0 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -2017,7 +2017,7 @@ no_early_rx:
 #if RX_BUF_IDX == 3
 			wrap_copy(skb, rx_ring, ring_offset+4, pkt_size);
 #else
-			eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0);
+			skb_copy_to_linear_data (skb, &rx_ring[ring_offset + 4], pkt_size);
 #endif
 			skb_put (skb, pkt_size);
 
diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c
index 81d5a374042a..a45de6975bfe 100644
--- a/drivers/net/a2065.c
+++ b/drivers/net/a2065.c
@@ -322,9 +322,9 @@ static int lance_rx (struct net_device *dev)
 
 			skb_reserve (skb, 2);		/* 16 byte align */
 			skb_put (skb, len);		/* make room */
-			eth_copy_and_sum(skb,
+			skb_copy_to_linear_data(skb,
 					 (unsigned char *)&(ib->rx_buf [lp->rx_new][0]),
-					 len, 0);
+					 len);
 			skb->protocol = eth_type_trans (skb, dev);
 			netif_rx (skb);
 			dev->last_rx = jiffies;
diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c
index a241ae7855a3..bc5a38a6705f 100644
--- a/drivers/net/ariadne.c
+++ b/drivers/net/ariadne.c
@@ -746,7 +746,7 @@ static int ariadne_rx(struct net_device *dev)
 
 	    skb_reserve(skb,2);		/* 16 byte align */
 	    skb_put(skb,pkt_len);	/* Make room */
-	    eth_copy_and_sum(skb, (char *)priv->rx_buff[entry], pkt_len,0);
+	    skb_copy_to_linear_data(skb, (char *)priv->rx_buff[entry], pkt_len);
 	    skb->protocol=eth_type_trans(skb,dev);
 #if 0
 	    printk(KERN_DEBUG "RX pkt type 0x%04x from ",
diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 2438c5bff237..f6ece1d43f6e 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -258,7 +258,7 @@ static int ep93xx_rx(struct net_device *dev, int *budget)
 			skb_reserve(skb, 2);
 			dma_sync_single(NULL, ep->descs->rdesc[entry].buf_addr,
 						length, DMA_FROM_DEVICE);
-			eth_copy_and_sum(skb, ep->rx_buf[entry], length, 0);
+			skb_copy_to_linear_data(skb, ep->rx_buf[entry], length);
 			skb_put(skb, length);
 			skb->protocol = eth_type_trans(skb, dev);
 
diff --git a/drivers/net/au1000_eth.c b/drivers/net/au1000_eth.c
index c27cfcef45fa..e86b3691765b 100644
--- a/drivers/net/au1000_eth.c
+++ b/drivers/net/au1000_eth.c
@@ -1205,8 +1205,8 @@ static int au1000_rx(struct net_device *dev)
 				continue;
 			}
 			skb_reserve(skb, 2);	/* 16 byte IP header align */
-			eth_copy_and_sum(skb,
-				(unsigned char *)pDB->vaddr, frmlen, 0);
+			skb_copy_to_linear_data(skb,
+				(unsigned char *)pDB->vaddr, frmlen);
 			skb_put(skb, frmlen);
 			skb->protocol = eth_type_trans(skb, dev);
 			netif_rx(skb);	/* pass the packet to upper layers */
diff --git a/drivers/net/dl2k.c b/drivers/net/dl2k.c
index 74ec64a1625d..a4ace071f1cb 100644
--- a/drivers/net/dl2k.c
+++ b/drivers/net/dl2k.c
@@ -866,9 +866,9 @@ receive_packet (struct net_device *dev)
 							    PCI_DMA_FROMDEVICE);
 				/* 16 byte align the IP header */
 				skb_reserve (skb, 2);
-				eth_copy_and_sum (skb,
+				skb_copy_to_linear_data (skb,
 						  np->rx_skbuff[entry]->data,
-						  pkt_len, 0);
+						  pkt_len);
 				skb_put (skb, pkt_len);
 				pci_dma_sync_single_for_device(np->pdev,
 				  			       desc->fraginfo &
diff --git a/drivers/net/eepro100.c b/drivers/net/eepro100.c
index 9800341956a2..9afa47edfc58 100644
--- a/drivers/net/eepro100.c
+++ b/drivers/net/eepro100.c
@@ -1801,7 +1801,7 @@ speedo_rx(struct net_device *dev)
 
 #if 1 || USE_IP_CSUM
 				/* Packet is in one chunk -- we can copy + cksum. */
-				eth_copy_and_sum(skb, sp->rx_skbuff[entry]->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb, sp->rx_skbuff[entry]->data, pkt_len);
 				skb_put(skb, pkt_len);
 #else
 				skb_copy_from_linear_data(sp->rx_skbuff[entry],
diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c
index 5e517946f46a..119778401e48 100644
--- a/drivers/net/epic100.c
+++ b/drivers/net/epic100.c
@@ -1201,7 +1201,7 @@ static int epic_rx(struct net_device *dev, int budget)
 							    ep->rx_ring[entry].bufaddr,
 							    ep->rx_buf_sz,
 							    PCI_DMA_FROMDEVICE);
-				eth_copy_and_sum(skb, ep->rx_skbuff[entry]->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb, ep->rx_skbuff[entry]->data, pkt_len);
 				skb_put(skb, pkt_len);
 				pci_dma_sync_single_for_device(ep->pci_dev,
 							       ep->rx_ring[entry].bufaddr,
diff --git a/drivers/net/fealnx.c b/drivers/net/fealnx.c
index abe9b089c610..ff9f177d7157 100644
--- a/drivers/net/fealnx.c
+++ b/drivers/net/fealnx.c
@@ -1727,8 +1727,8 @@ static int netdev_rx(struct net_device *dev)
 				/* Call copy + cksum if available. */
 
 #if ! defined(__alpha__)
-				eth_copy_and_sum(skb,
-					np->cur_rx->skbuff->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb,
+					np->cur_rx->skbuff->data, pkt_len);
 				skb_put(skb, pkt_len);
 #else
 				memcpy(skb_put(skb, pkt_len),
diff --git a/drivers/net/fec.c b/drivers/net/fec.c
index 255b09124e11..03023dd17829 100644
--- a/drivers/net/fec.c
+++ b/drivers/net/fec.c
@@ -648,7 +648,7 @@ while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) {
 		fep->stats.rx_dropped++;
 	} else {
 		skb_put(skb,pkt_len-4);	/* Make room */
-		eth_copy_and_sum(skb, data, pkt_len-4, 0);
+		skb_copy_to_linear_data(skb, data, pkt_len-4);
 		skb->protocol=eth_type_trans(skb,dev);
 		netif_rx(skb);
 	}
diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c
index 2521b111b3a5..15254dc7876a 100644
--- a/drivers/net/hamachi.c
+++ b/drivers/net/hamachi.c
@@ -1575,8 +1575,8 @@ static int hamachi_rx(struct net_device *dev)
 							    PCI_DMA_FROMDEVICE);
 				/* Call copy + cksum if available. */
 #if 1 || USE_IP_COPYSUM
-				eth_copy_and_sum(skb,
-					hmp->rx_skbuff[entry]->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb,
+					hmp->rx_skbuff[entry]->data, pkt_len);
 				skb_put(skb, pkt_len);
 #else
 				memcpy(skb_put(skb, pkt_len), hmp->rx_ring_dma
diff --git a/drivers/net/ixp2000/ixpdev.c b/drivers/net/ixp2000/ixpdev.c
index d5f694fc4a21..d9ce1aef148a 100644
--- a/drivers/net/ixp2000/ixpdev.c
+++ b/drivers/net/ixp2000/ixpdev.c
@@ -111,7 +111,7 @@ static int ixpdev_rx(struct net_device *dev, int *budget)
 		skb = dev_alloc_skb(desc->pkt_length + 2);
 		if (likely(skb != NULL)) {
 			skb_reserve(skb, 2);
-			eth_copy_and_sum(skb, buf, desc->pkt_length, 0);
+			skb_copy_to_linear_data(skb, buf, desc->pkt_length);
 			skb_put(skb, desc->pkt_length);
 			skb->protocol = eth_type_trans(skb, nds[desc->channel]);
 
diff --git a/drivers/net/lance.c b/drivers/net/lance.c
index 0fe96c85828b..a2f37e52b928 100644
--- a/drivers/net/lance.c
+++ b/drivers/net/lance.c
@@ -1186,9 +1186,9 @@ lance_rx(struct net_device *dev)
 				}
 				skb_reserve(skb,2);	/* 16 byte align */
 				skb_put(skb,pkt_len);	/* Make room */
-				eth_copy_and_sum(skb,
+				skb_copy_to_linear_data(skb,
 					(unsigned char *)isa_bus_to_virt((lp->rx_ring[entry].base & 0x00ffffff)),
-					pkt_len,0);
+					pkt_len);
 				skb->protocol=eth_type_trans(skb,dev);
 				netif_rx(skb);
 				dev->last_rx = jiffies;
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index 460a08718c69..3450051ae56b 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -2357,8 +2357,8 @@ static void netdev_rx(struct net_device *dev, int *work_done, int work_to_do)
 					np->rx_dma[entry],
 					buflen,
 					PCI_DMA_FROMDEVICE);
-				eth_copy_and_sum(skb,
-					np->rx_skbuff[entry]->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb,
+					np->rx_skbuff[entry]->data, pkt_len);
 				skb_put(skb, pkt_len);
 				pci_dma_sync_single_for_device(np->pci_dev,
 					np->rx_dma[entry],
diff --git a/drivers/net/ni52.c b/drivers/net/ni52.c
index 8dbd6d1900b5..5e7999db2096 100644
--- a/drivers/net/ni52.c
+++ b/drivers/net/ni52.c
@@ -936,7 +936,7 @@ static void ni52_rcv_int(struct net_device *dev)
 					{
 						skb_reserve(skb,2);
 						skb_put(skb,totlen);
-						eth_copy_and_sum(skb,(char *) p->base+(unsigned long) rbd->buffer,totlen,0);
+						skb_copy_to_linear_data(skb,(char *) p->base+(unsigned long) rbd->buffer,totlen);
 						skb->protocol=eth_type_trans(skb,dev);
 						netif_rx(skb);
 						dev->last_rx = jiffies;
diff --git a/drivers/net/ni65.c b/drivers/net/ni65.c
index 3818edf0ac18..4ef5fe345191 100644
--- a/drivers/net/ni65.c
+++ b/drivers/net/ni65.c
@@ -1096,7 +1096,7 @@ static void ni65_recv_intr(struct net_device *dev,int csr0)
 #ifdef RCV_VIA_SKB
 				if( (unsigned long) (skb->data + R_BUF_SIZE) > 0x1000000) {
 					skb_put(skb,len);
-					eth_copy_and_sum(skb, (unsigned char *)(p->recv_skb[p->rmdnum]->data),len,0);
+					skb_copy_to_linear_data(skb, (unsigned char *)(p->recv_skb[p->rmdnum]->data),len);
 				}
 				else {
 					struct sk_buff *skb1 = p->recv_skb[p->rmdnum];
@@ -1108,7 +1108,7 @@ static void ni65_recv_intr(struct net_device *dev,int csr0)
 				}
 #else
 				skb_put(skb,len);
-				eth_copy_and_sum(skb, (unsigned char *) p->recvbounce[p->rmdnum],len,0);
+				skb_copy_to_linear_data(skb, (unsigned char *) p->recvbounce[p->rmdnum],len);
 #endif
 				p->stats.rx_packets++;
 				p->stats.rx_bytes += len;
diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c
index df8998b4f37e..3cdbe118200b 100644
--- a/drivers/net/pci-skeleton.c
+++ b/drivers/net/pci-skeleton.c
@@ -1567,7 +1567,7 @@ static void netdrv_rx_interrupt (struct net_device *dev,
 		if (skb) {
 			skb_reserve (skb, 2);	/* 16 byte align the IP fields. */
 
-			eth_copy_and_sum (skb, &rx_ring[ring_offset + 4], pkt_size, 0);
+			skb_copy_to_linear_data (skb, &rx_ring[ring_offset + 4], pkt_size);
 			skb_put (skb, pkt_size);
 
 			skb->protocol = eth_type_trans (skb, dev);
diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c
index 9c171a7390e2..465485a3fbc6 100644
--- a/drivers/net/pcnet32.c
+++ b/drivers/net/pcnet32.c
@@ -1235,9 +1235,9 @@ static void pcnet32_rx_entry(struct net_device *dev,
 					    lp->rx_dma_addr[entry],
 					    pkt_len,
 					    PCI_DMA_FROMDEVICE);
-		eth_copy_and_sum(skb,
+		skb_copy_to_linear_data(skb,
 				 (unsigned char *)(lp->rx_skbuff[entry]->data),
-				 pkt_len, 0);
+				 pkt_len);
 		pci_dma_sync_single_for_device(lp->pci_dev,
 					       lp->rx_dma_addr[entry],
 					       pkt_len,
diff --git a/drivers/net/saa9730.c b/drivers/net/saa9730.c
index ad94358ece89..451486b32f23 100644
--- a/drivers/net/saa9730.c
+++ b/drivers/net/saa9730.c
@@ -690,9 +690,9 @@ static int lan_saa9730_rx(struct net_device *dev)
 				lp->stats.rx_packets++;
 				skb_reserve(skb, 2);	/* 16 byte align */
 				skb_put(skb, len);	/* make room */
-				eth_copy_and_sum(skb,
+				skb_copy_to_linear_data(skb,
 						 (unsigned char *) pData,
-						 len, 0);
+						 len);
 				skb->protocol = eth_type_trans(skb, dev);
 				netif_rx(skb);
 				dev->last_rx = jiffies;
diff --git a/drivers/net/sgiseeq.c b/drivers/net/sgiseeq.c
index 2106becf6990..384b4685e977 100644
--- a/drivers/net/sgiseeq.c
+++ b/drivers/net/sgiseeq.c
@@ -320,7 +320,7 @@ static inline void sgiseeq_rx(struct net_device *dev, struct sgiseeq_private *sp
 				skb_put(skb, len);
 
 				/* Copy out of kseg1 to avoid silly cache flush. */
-				eth_copy_and_sum(skb, pkt_pointer + 2, len, 0);
+				skb_copy_to_linear_data(skb, pkt_pointer + 2, len);
 				skb->protocol = eth_type_trans(skb, dev);
 
 				/* We don't want to receive our own packets */
diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c
index bc8de48da313..ec2ad9f0efa2 100644
--- a/drivers/net/sis190.c
+++ b/drivers/net/sis190.c
@@ -548,7 +548,7 @@ static inline int sis190_try_rx_copy(struct sk_buff **sk_buff, int pkt_size,
 		skb = dev_alloc_skb(pkt_size + NET_IP_ALIGN);
 		if (skb) {
 			skb_reserve(skb, NET_IP_ALIGN);
-			eth_copy_and_sum(skb, sk_buff[0]->data, pkt_size, 0);
+			skb_copy_to_linear_data(skb, sk_buff[0]->data, pkt_size);
 			*sk_buff = skb;
 			sis190_give_to_asic(desc, rx_buf_sz);
 			ret = 0;
diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c
index 786d4b9c07ec..f2e101967204 100644
--- a/drivers/net/starfire.c
+++ b/drivers/net/starfire.c
@@ -1456,7 +1456,7 @@ static int __netdev_rx(struct net_device *dev, int *quota)
 			pci_dma_sync_single_for_cpu(np->pci_dev,
 						    np->rx_info[entry].mapping,
 						    pkt_len, PCI_DMA_FROMDEVICE);
-			eth_copy_and_sum(skb, np->rx_info[entry].skb->data, pkt_len, 0);
+			skb_copy_to_linear_data(skb, np->rx_info[entry].skb->data, pkt_len);
 			pci_dma_sync_single_for_device(np->pci_dev,
 						       np->rx_info[entry].mapping,
 						       pkt_len, PCI_DMA_FROMDEVICE);
diff --git a/drivers/net/sun3_82586.c b/drivers/net/sun3_82586.c
index a123ea87893b..b77ab6e8fd35 100644
--- a/drivers/net/sun3_82586.c
+++ b/drivers/net/sun3_82586.c
@@ -777,7 +777,7 @@ static void sun3_82586_rcv_int(struct net_device *dev)
 					{
 						skb_reserve(skb,2);
 						skb_put(skb,totlen);
-						eth_copy_and_sum(skb,(char *) p->base+swab32((unsigned long) rbd->buffer),totlen,0);
+						skb_copy_to_linear_data(skb,(char *) p->base+swab32((unsigned long) rbd->buffer),totlen);
 						skb->protocol=eth_type_trans(skb,dev);
 						netif_rx(skb);
 						p->stats.rx_packets++;
diff --git a/drivers/net/sun3lance.c b/drivers/net/sun3lance.c
index 791e081fdc15..f1548c033327 100644
--- a/drivers/net/sun3lance.c
+++ b/drivers/net/sun3lance.c
@@ -853,10 +853,9 @@ static int lance_rx( struct net_device *dev )
 
 				skb_reserve( skb, 2 );	/* 16 byte align */
 				skb_put( skb, pkt_len );	/* Make room */
-//			        skb_copy_to_linear_data(skb, PKTBUF_ADDR(head), pkt_len);
-				eth_copy_and_sum(skb,
+				skb_copy_to_linear_data(skb,
 						 PKTBUF_ADDR(head),
-						 pkt_len, 0);
+						 pkt_len);
 
 				skb->protocol = eth_type_trans( skb, dev );
 				netif_rx( skb );
diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c
index 2ad8d58dee3b..b3e0158def4f 100644
--- a/drivers/net/sunbmac.c
+++ b/drivers/net/sunbmac.c
@@ -860,7 +860,7 @@ static void bigmac_rx(struct bigmac *bp)
 			sbus_dma_sync_single_for_cpu(bp->bigmac_sdev,
 						     this->rx_addr, len,
 						     SBUS_DMA_FROMDEVICE);
-			eth_copy_and_sum(copy_skb, (unsigned char *)skb->data, len, 0);
+			skb_copy_to_linear_data(copy_skb, (unsigned char *)skb->data, len);
 			sbus_dma_sync_single_for_device(bp->bigmac_sdev,
 							this->rx_addr, len,
 							SBUS_DMA_FROMDEVICE);
diff --git a/drivers/net/sundance.c b/drivers/net/sundance.c
index e1f912d04043..c8ba534c17bf 100644
--- a/drivers/net/sundance.c
+++ b/drivers/net/sundance.c
@@ -1313,7 +1313,7 @@ static void rx_poll(unsigned long data)
 							    np->rx_buf_sz,
 							    PCI_DMA_FROMDEVICE);
 
-				eth_copy_and_sum(skb, np->rx_skbuff[entry]->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb, np->rx_skbuff[entry]->data, pkt_len);
 				pci_dma_sync_single_for_device(np->pci_dev,
 							       desc->frag[0].addr,
 							       np->rx_buf_sz,
diff --git a/drivers/net/sunlance.c b/drivers/net/sunlance.c
index 42722530ab24..053b7cb0d944 100644
--- a/drivers/net/sunlance.c
+++ b/drivers/net/sunlance.c
@@ -549,9 +549,9 @@ static void lance_rx_dvma(struct net_device *dev)
 
 			skb_reserve(skb, 2);		/* 16 byte align */
 			skb_put(skb, len);		/* make room */
-			eth_copy_and_sum(skb,
+			skb_copy_to_linear_data(skb,
 					 (unsigned char *)&(ib->rx_buf [entry][0]),
-					 len, 0);
+					 len);
 			skb->protocol = eth_type_trans(skb, dev);
 			netif_rx(skb);
 			dev->last_rx = jiffies;
diff --git a/drivers/net/sunqe.c b/drivers/net/sunqe.c
index fa70e0b78af7..1b65ae8a1c7c 100644
--- a/drivers/net/sunqe.c
+++ b/drivers/net/sunqe.c
@@ -439,8 +439,8 @@ static void qe_rx(struct sunqe *qep)
 			} else {
 				skb_reserve(skb, 2);
 				skb_put(skb, len);
-				eth_copy_and_sum(skb, (unsigned char *) this_qbuf,
-						 len, 0);
+				skb_copy_to_linear_data(skb, (unsigned char *) this_qbuf,
+						 len);
 				skb->protocol = eth_type_trans(skb, qep->dev);
 				netif_rx(skb);
 				qep->dev->last_rx = jiffies;
diff --git a/drivers/net/tulip/interrupt.c b/drivers/net/tulip/interrupt.c
index ea896777bcaf..53efd6694e75 100644
--- a/drivers/net/tulip/interrupt.c
+++ b/drivers/net/tulip/interrupt.c
@@ -197,8 +197,8 @@ int tulip_poll(struct net_device *dev, int *budget)
 								   tp->rx_buffers[entry].mapping,
 								   pkt_len, PCI_DMA_FROMDEVICE);
 #if ! defined(__alpha__)
-                                       eth_copy_and_sum(skb, tp->rx_buffers[entry].skb->data,
-                                                        pkt_len, 0);
+                                       skb_copy_to_linear_data(skb, tp->rx_buffers[entry].skb->data,
+                                                        pkt_len);
                                        skb_put(skb, pkt_len);
 #else
                                        memcpy(skb_put(skb, pkt_len),
@@ -420,8 +420,8 @@ static int tulip_rx(struct net_device *dev)
 							    tp->rx_buffers[entry].mapping,
 							    pkt_len, PCI_DMA_FROMDEVICE);
 #if ! defined(__alpha__)
-				eth_copy_and_sum(skb, tp->rx_buffers[entry].skb->data,
-						 pkt_len, 0);
+				skb_copy_to_linear_data(skb, tp->rx_buffers[entry].skb->data,
+						 pkt_len);
 				skb_put(skb, pkt_len);
 #else
 				memcpy(skb_put(skb, pkt_len),
diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c
index 38f3b99716b8..5824f6a35495 100644
--- a/drivers/net/tulip/winbond-840.c
+++ b/drivers/net/tulip/winbond-840.c
@@ -1232,7 +1232,7 @@ static int netdev_rx(struct net_device *dev)
 				pci_dma_sync_single_for_cpu(np->pci_dev,np->rx_addr[entry],
 							    np->rx_skbuff[entry]->len,
 							    PCI_DMA_FROMDEVICE);
-				eth_copy_and_sum(skb, np->rx_skbuff[entry]->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb, np->rx_skbuff[entry]->data, pkt_len);
 				skb_put(skb, pkt_len);
 				pci_dma_sync_single_for_device(np->pci_dev,np->rx_addr[entry],
 							       np->rx_skbuff[entry]->len,
diff --git a/drivers/net/tulip/xircom_cb.c b/drivers/net/tulip/xircom_cb.c
index 2470b1ee33c0..37e35cd277a1 100644
--- a/drivers/net/tulip/xircom_cb.c
+++ b/drivers/net/tulip/xircom_cb.c
@@ -1208,7 +1208,7 @@ static void investigate_read_descriptor(struct net_device *dev,struct xircom_pri
 				goto out;
 			}
 			skb_reserve(skb, 2);
-			eth_copy_and_sum(skb, (unsigned char*)&card->rx_buffer[bufferoffset / 4], pkt_len, 0);
+			skb_copy_to_linear_data(skb, (unsigned char*)&card->rx_buffer[bufferoffset / 4], pkt_len);
 			skb_put(skb, pkt_len);
 			skb->protocol = eth_type_trans(skb, dev);
 			netif_rx(skb);
diff --git a/drivers/net/tulip/xircom_tulip_cb.c b/drivers/net/tulip/xircom_tulip_cb.c
index f64172927377..f984fbde8b23 100644
--- a/drivers/net/tulip/xircom_tulip_cb.c
+++ b/drivers/net/tulip/xircom_tulip_cb.c
@@ -1242,8 +1242,8 @@ xircom_rx(struct net_device *dev)
 				&& (skb = dev_alloc_skb(pkt_len + 2)) != NULL) {
 				skb_reserve(skb, 2);	/* 16 byte align the IP header */
 #if ! defined(__alpha__)
-				eth_copy_and_sum(skb, bus_to_virt(tp->rx_ring[entry].buffer1),
-								 pkt_len, 0);
+				skb_copy_to_linear_data(skb, bus_to_virt(tp->rx_ring[entry].buffer1),
+								 pkt_len);
 				skb_put(skb, pkt_len);
 #else
 				memcpy(skb_put(skb, pkt_len),
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index 15b2fb8aa492..df524548d531 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -1703,7 +1703,7 @@ typhoon_rx(struct typhoon *tp, struct basic_ring *rxRing, volatile u32 * ready,
 			pci_dma_sync_single_for_cpu(tp->pdev, dma_addr,
 						    PKT_BUF_SZ,
 						    PCI_DMA_FROMDEVICE);
-			eth_copy_and_sum(new_skb, skb->data, pkt_len, 0);
+			skb_copy_to_linear_data(new_skb, skb->data, pkt_len);
 			pci_dma_sync_single_for_device(tp->pdev, dma_addr,
 						       PKT_BUF_SZ,
 						       PCI_DMA_FROMDEVICE);
diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c
index 86e90c59d551..76752d84a30f 100644
--- a/drivers/net/usb/catc.c
+++ b/drivers/net/usb/catc.c
@@ -255,7 +255,7 @@ static void catc_rx_done(struct urb *urb)
 		if (!(skb = dev_alloc_skb(pkt_len)))
 			return;
 
-		eth_copy_and_sum(skb, pkt_start + pkt_offset, pkt_len, 0);
+		skb_copy_to_linear_data(skb, pkt_start + pkt_offset, pkt_len);
 		skb_put(skb, pkt_len);
 
 		skb->protocol = eth_type_trans(skb, catc->netdev);
diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c
index 60d29440f316..524dc5f5e46d 100644
--- a/drivers/net/usb/kaweth.c
+++ b/drivers/net/usb/kaweth.c
@@ -635,7 +635,7 @@ static void kaweth_usb_receive(struct urb *urb)
 
 		skb_reserve(skb, 2);    /* Align IP on 16 byte boundaries */
 
-		eth_copy_and_sum(skb, kaweth->rx_buf + 2, pkt_len, 0);
+		skb_copy_to_linear_data(skb, kaweth->rx_buf + 2, pkt_len);
 
 		skb_put(skb, pkt_len);
 
diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c
index adea290a9d5e..565f6cc185ce 100644
--- a/drivers/net/via-rhine.c
+++ b/drivers/net/via-rhine.c
@@ -1492,9 +1492,9 @@ static int rhine_rx(struct net_device *dev, int limit)
 							    rp->rx_buf_sz,
 							    PCI_DMA_FROMDEVICE);
 
-				eth_copy_and_sum(skb,
+				skb_copy_to_linear_data(skb,
 						 rp->rx_skbuff[entry]->data,
-						 pkt_len, 0);
+						 pkt_len);
 				skb_put(skb, pkt_len);
 				pci_dma_sync_single_for_device(rp->pdev,
 							       rp->rx_skbuff_dma[entry],
diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c
index ce9230b2f630..c8b5c2271938 100644
--- a/drivers/net/wireless/wl3501_cs.c
+++ b/drivers/net/wireless/wl3501_cs.c
@@ -1011,7 +1011,7 @@ static inline void wl3501_md_ind_interrupt(struct net_device *dev,
 	} else {
 		skb->dev = dev;
 		skb_reserve(skb, 2); /* IP headers on 16 bytes boundaries */
-		eth_copy_and_sum(skb, (unsigned char *)&sig.daddr, 12, 0);
+		skb_copy_to_linear_data(skb, (unsigned char *)&sig.daddr, 12);
 		wl3501_receive(this, skb->data, pkt_len);
 		skb_put(skb, pkt_len);
 		skb->protocol	= eth_type_trans(skb, dev);
diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c
index f2a90a7fa2d6..870c5393c21a 100644
--- a/drivers/net/yellowfin.c
+++ b/drivers/net/yellowfin.c
@@ -1137,7 +1137,7 @@ static int yellowfin_rx(struct net_device *dev)
 				if (skb == NULL)
 					break;
 				skb_reserve(skb, 2);	/* 16 byte align the IP header */
-				eth_copy_and_sum(skb, rx_skb->data, pkt_len, 0);
+				skb_copy_to_linear_data(skb, rx_skb->data, pkt_len);
 				skb_put(skb, pkt_len);
 				pci_dma_sync_single_for_device(yp->pci_dev, desc->addr,
 											   yp->rx_buf_sz,
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 071c67abed86..f48eb89efd0f 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -40,12 +40,6 @@ extern int		eth_header_cache(struct neighbour *neigh,
 					 struct hh_cache *hh);
 
 extern struct net_device *alloc_etherdev(int sizeof_priv);
-static inline void eth_copy_and_sum (struct sk_buff *dest, 
-				     const unsigned char *src, 
-				     int len, int base)
-{
-	memcpy (dest->data, src, len);
-}
 
 /**
  * is_zero_ether_addr - Determine if give Ethernet address is all zeros.
-- 
cgit v1.3-8-gc7d7


From 6472ce6096bf27d85a1f2580964a36f290bd60a9 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 13 Jun 2007 12:03:21 -0700
Subject: [NET]: Mark struct net_device * argument to netdev_priv const

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3a70f553b28f..94cc77cd3aa3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -546,7 +546,7 @@ struct net_device
 #define	NETDEV_ALIGN		32
 #define	NETDEV_ALIGN_CONST	(NETDEV_ALIGN - 1)
 
-static inline void *netdev_priv(struct net_device *dev)
+static inline void *netdev_priv(const struct net_device *dev)
 {
 	return (char *)dev + ((sizeof(struct net_device)
 					+ NETDEV_ALIGN_CONST)
-- 
cgit v1.3-8-gc7d7


From 38f7b870d4a6a5d3ec21557e849620cb7d032965 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 13 Jun 2007 12:03:51 -0700
Subject: [RTNETLINK]: Link creation API

Add rtnetlink API for creating, changing and deleting software devices.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_link.h   |  13 ++
 include/linux/netdevice.h |   3 +
 include/net/rtnetlink.h   |  58 ++++++++
 net/core/rtnetlink.c      | 342 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 409 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 604c2434f71c..3144babd2357 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -76,6 +76,8 @@ enum
 #define IFLA_WEIGHT IFLA_WEIGHT
 	IFLA_OPERSTATE,
 	IFLA_LINKMODE,
+	IFLA_LINKINFO,
+#define IFLA_LINKINFO IFLA_LINKINFO
 	__IFLA_MAX
 };
 
@@ -140,4 +142,15 @@ struct ifla_cacheinfo
 	__u32	retrans_time;
 };
 
+enum
+{
+	IFLA_INFO_UNSPEC,
+	IFLA_INFO_KIND,
+	IFLA_INFO_DATA,
+	IFLA_INFO_XSTATS,
+	__IFLA_INFO_MAX,
+};
+
+#define IFLA_INFO_MAX	(__IFLA_INFO_MAX - 1)
+
 #endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94cc77cd3aa3..e7913ee5581c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -540,6 +540,9 @@ struct net_device
 	struct device		dev;
 	/* space for optional statistics and wireless sysfs groups */
 	struct attribute_group  *sysfs_groups[3];
+
+	/* rtnetlink link ops */
+	const struct rtnl_link_ops *rtnl_link_ops;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 3b3d4745618d..3861c05cdf0f 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -22,4 +22,62 @@ static inline int rtnl_msg_family(struct nlmsghdr *nlh)
 		return AF_UNSPEC;
 }
 
+/**
+ *	struct rtnl_link_ops - rtnetlink link operations
+ *
+ *	@list: Used internally
+ *	@kind: Identifier
+ *	@maxtype: Highest device specific netlink attribute number
+ *	@policy: Netlink policy for device specific attribute validation
+ *	@validate: Optional validation function for netlink/changelink parameters
+ *	@priv_size: sizeof net_device private space
+ *	@setup: net_device setup function
+ *	@newlink: Function for configuring and registering a new device
+ *	@changelink: Function for changing parameters of an existing device
+ *	@dellink: Function to remove a device
+ *	@get_size: Function to calculate required room for dumping device
+ *		   specific netlink attributes
+ *	@fill_info: Function to dump device specific netlink attributes
+ *	@get_xstats_size: Function to calculate required room for dumping devic
+ *			  specific statistics
+ *	@fill_xstats: Function to dump device specific statistics
+ */
+struct rtnl_link_ops {
+	struct list_head	list;
+
+	const char		*kind;
+
+	size_t			priv_size;
+	void			(*setup)(struct net_device *dev);
+
+	int			maxtype;
+	const struct nla_policy	*policy;
+	int			(*validate)(struct nlattr *tb[],
+					    struct nlattr *data[]);
+
+	int			(*newlink)(struct net_device *dev,
+					   struct nlattr *tb[],
+					   struct nlattr *data[]);
+	int			(*changelink)(struct net_device *dev,
+					      struct nlattr *tb[],
+					      struct nlattr *data[]);
+	void			(*dellink)(struct net_device *dev);
+
+	size_t			(*get_size)(const struct net_device *dev);
+	int			(*fill_info)(struct sk_buff *skb,
+					     const struct net_device *dev);
+
+	size_t			(*get_xstats_size)(const struct net_device *dev);
+	int			(*fill_xstats)(struct sk_buff *skb,
+					       const struct net_device *dev);
+};
+
+extern int	__rtnl_link_register(struct rtnl_link_ops *ops);
+extern void	__rtnl_link_unregister(struct rtnl_link_ops *ops);
+
+extern int	rtnl_link_register(struct rtnl_link_ops *ops);
+extern void	rtnl_link_unregister(struct rtnl_link_ops *ops);
+
+#define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)
+
 #endif
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 25ca219154e0..06c0c5afabf0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -243,6 +243,143 @@ void rtnl_unregister_all(int protocol)
 
 EXPORT_SYMBOL_GPL(rtnl_unregister_all);
 
+static LIST_HEAD(link_ops);
+
+/**
+ * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * The caller must hold the rtnl_mutex. This function should be used
+ * by drivers that create devices during module initialization. It
+ * must be called before registering the devices.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_link_register(struct rtnl_link_ops *ops)
+{
+	list_add_tail(&ops->list, &link_ops);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(__rtnl_link_register);
+
+/**
+ * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_link_register(struct rtnl_link_ops *ops)
+{
+	int err;
+
+	rtnl_lock();
+	err = __rtnl_link_register(ops);
+	rtnl_unlock();
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(rtnl_link_register);
+
+/**
+ * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex. This function should be used
+ * by drivers that unregister devices during module unloading. It must
+ * be called after unregistering the devices.
+ */
+void __rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+	list_del(&ops->list);
+}
+
+EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+
+/**
+ * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ */
+void rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+	rtnl_lock();
+	__rtnl_link_unregister(ops);
+	rtnl_unlock();
+}
+
+EXPORT_SYMBOL_GPL(rtnl_link_unregister);
+
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+	const struct rtnl_link_ops *ops;
+
+	list_for_each_entry(ops, &link_ops, list) {
+		if (!strcmp(ops->kind, kind))
+			return ops;
+	}
+	return NULL;
+}
+
+static size_t rtnl_link_get_size(const struct net_device *dev)
+{
+	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+	size_t size;
+
+	if (!ops)
+		return 0;
+
+	size = nlmsg_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
+	       nlmsg_total_size(strlen(ops->kind) + 1);	 /* IFLA_INFO_KIND */
+
+	if (ops->get_size)
+		/* IFLA_INFO_DATA + nested data */
+		size += nlmsg_total_size(sizeof(struct nlattr)) +
+			ops->get_size(dev);
+
+	if (ops->get_xstats_size)
+		size += ops->get_xstats_size(dev);	/* IFLA_INFO_XSTATS */
+
+	return size;
+}
+
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+	struct nlattr *linkinfo, *data;
+	int err = -EMSGSIZE;
+
+	linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+	if (linkinfo == NULL)
+		goto out;
+
+	if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
+		goto err_cancel_link;
+	if (ops->fill_xstats) {
+		err = ops->fill_xstats(skb, dev);
+		if (err < 0)
+			goto err_cancel_link;
+	}
+	if (ops->fill_info) {
+		data = nla_nest_start(skb, IFLA_INFO_DATA);
+		if (data == NULL)
+			goto err_cancel_link;
+		err = ops->fill_info(skb, dev);
+		if (err < 0)
+			goto err_cancel_data;
+		nla_nest_end(skb, data);
+	}
+
+	nla_nest_end(skb, linkinfo);
+	return 0;
+
+err_cancel_data:
+	nla_nest_cancel(skb, data);
+err_cancel_link:
+	nla_nest_cancel(skb, linkinfo);
+out:
+	return err;
+}
+
 static const int rtm_min[RTM_NR_FAMILIES] =
 {
 	[RTM_FAM(RTM_NEWLINK)]      = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
@@ -437,7 +574,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 	a->tx_compressed = b->tx_compressed;
 };
 
-static inline size_t if_nlmsg_size(void)
+static inline size_t if_nlmsg_size(const struct net_device *dev)
 {
 	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
 	       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
@@ -452,7 +589,8 @@ static inline size_t if_nlmsg_size(void)
 	       + nla_total_size(4) /* IFLA_LINK */
 	       + nla_total_size(4) /* IFLA_MASTER */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
-	       + nla_total_size(1); /* IFLA_LINKMODE */
+	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
 }
 
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
@@ -522,6 +660,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
+	if (dev->rtnl_link_ops) {
+		if (rtnl_link_fill(skb, dev) < 0)
+			goto nla_put_failure;
+	}
+
 	return nlmsg_end(skb, nlh);
 
 nla_put_failure:
@@ -553,6 +696,8 @@ cont:
 
 static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_IFNAME]		= { .type = NLA_STRING, .len = IFNAMSIZ-1 },
+	[IFLA_ADDRESS]		= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+	[IFLA_BROADCAST]	= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
 	[IFLA_MAP]		= { .len = sizeof(struct rtnl_link_ifmap) },
 	[IFLA_MTU]		= { .type = NLA_U32 },
 	[IFLA_TXQLEN]		= { .type = NLA_U32 },
@@ -561,10 +706,15 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKMODE]		= { .type = NLA_U8 },
 };
 
+static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+	[IFLA_INFO_KIND]	= { .type = NLA_STRING },
+	[IFLA_INFO_DATA]	= { .type = NLA_NESTED },
+};
+
 static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
-		      struct nlattr **tb, char *ifname)
+		      struct nlattr **tb, char *ifname, int modified)
 {
-	int modified = 0, send_addr_notify = 0;
+	int send_addr_notify = 0;
 	int err;
 
 	if (tb[IFLA_MAP]) {
@@ -729,13 +879,189 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	    nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
 		goto errout_dev;
 
-	err = do_setlink(dev, ifm, tb, ifname);
+	err = do_setlink(dev, ifm, tb, ifname, 0);
 errout_dev:
 	dev_put(dev);
 errout:
 	return err;
 }
 
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	const struct rtnl_link_ops *ops;
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	char ifname[IFNAMSIZ];
+	struct nlattr *tb[IFLA_MAX+1];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0)
+		dev = __dev_get_by_index(ifm->ifi_index);
+	else if (tb[IFLA_IFNAME])
+		dev = __dev_get_by_name(ifname);
+	else
+		return -EINVAL;
+
+	if (!dev)
+		return -ENODEV;
+
+	ops = dev->rtnl_link_ops;
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	ops->dellink(dev);
+	return 0;
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	const struct rtnl_link_ops *ops;
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	char kind[MODULE_NAME_LEN];
+	char ifname[IFNAMSIZ];
+	struct nlattr *tb[IFLA_MAX+1];
+	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+	int err;
+
+replay:
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		ifname[0] = '\0';
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0)
+		dev = __dev_get_by_index(ifm->ifi_index);
+	else if (ifname[0])
+		dev = __dev_get_by_name(ifname);
+	else
+		dev = NULL;
+
+	if (tb[IFLA_LINKINFO]) {
+		err = nla_parse_nested(linkinfo, IFLA_INFO_MAX,
+				       tb[IFLA_LINKINFO], ifla_info_policy);
+		if (err < 0)
+			return err;
+	} else
+		memset(linkinfo, 0, sizeof(linkinfo));
+
+	if (linkinfo[IFLA_INFO_KIND]) {
+		nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+		ops = rtnl_link_ops_get(kind);
+	} else {
+		kind[0] = '\0';
+		ops = NULL;
+	}
+
+	if (1) {
+		struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL;
+
+		if (ops) {
+			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+				err = nla_parse_nested(attr, ops->maxtype,
+						       linkinfo[IFLA_INFO_DATA],
+						       ops->policy);
+				if (err < 0)
+					return err;
+				data = attr;
+			}
+			if (ops->validate) {
+				err = ops->validate(tb, data);
+				if (err < 0)
+					return err;
+			}
+		}
+
+		if (dev) {
+			int modified = 0;
+
+			if (nlh->nlmsg_flags & NLM_F_EXCL)
+				return -EEXIST;
+			if (nlh->nlmsg_flags & NLM_F_REPLACE)
+				return -EOPNOTSUPP;
+
+			if (linkinfo[IFLA_INFO_DATA]) {
+				if (!ops || ops != dev->rtnl_link_ops ||
+				    !ops->changelink)
+					return -EOPNOTSUPP;
+
+				err = ops->changelink(dev, tb, data);
+				if (err < 0)
+					return err;
+				modified = 1;
+			}
+
+			return do_setlink(dev, ifm, tb, ifname, modified);
+		}
+
+		if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+			return -ENODEV;
+
+		if (ifm->ifi_index || ifm->ifi_flags || ifm->ifi_change)
+			return -EOPNOTSUPP;
+		if (tb[IFLA_ADDRESS] || tb[IFLA_BROADCAST] || tb[IFLA_MAP] ||
+		    tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+			return -EOPNOTSUPP;
+
+		if (!ops) {
+#ifdef CONFIG_KMOD
+			if (kind[0]) {
+				__rtnl_unlock();
+				request_module("rtnl-link-%s", kind);
+				rtnl_lock();
+				ops = rtnl_link_ops_get(kind);
+				if (ops)
+					goto replay;
+			}
+#endif
+			return -EOPNOTSUPP;
+		}
+
+		if (!ifname[0])
+			snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+		dev = alloc_netdev(ops->priv_size, ifname, ops->setup);
+		if (!dev)
+			return -ENOMEM;
+
+		if (strchr(dev->name, '%')) {
+			err = dev_alloc_name(dev, dev->name);
+			if (err < 0)
+				goto err_free;
+		}
+		dev->rtnl_link_ops = ops;
+
+		if (tb[IFLA_MTU])
+			dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+		if (tb[IFLA_TXQLEN])
+			dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+		if (tb[IFLA_WEIGHT])
+			dev->weight = nla_get_u32(tb[IFLA_WEIGHT]);
+		if (tb[IFLA_OPERSTATE])
+			set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+		if (tb[IFLA_LINKMODE])
+			dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+
+		err = ops->newlink(dev, tb, data);
+err_free:
+		if (err < 0)
+			free_netdev(dev);
+		return err;
+	}
+}
+
 static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
 	struct ifinfomsg *ifm;
@@ -756,7 +1082,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	} else
 		return -EINVAL;
 
-	nskb = nlmsg_new(if_nlmsg_size(), GFP_KERNEL);
+	nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
 	if (nskb == NULL) {
 		err = -ENOBUFS;
 		goto errout;
@@ -806,7 +1132,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
-	skb = nlmsg_new(if_nlmsg_size(), GFP_KERNEL);
+	skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
 	if (skb == NULL)
 		goto errout;
 
@@ -961,6 +1287,8 @@ void __init rtnetlink_init(void)
 
 	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo);
 	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL);
+	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL);
 
 	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all);
 	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all);
-- 
cgit v1.3-8-gc7d7


From 734423cf38021966a5d3bd5f5c6aaecaf32fb4ac Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 13 Jun 2007 12:07:07 -0700
Subject: [VLAN]: Use 32 bit value for skb->priority mapping

skb->priority has only 32 bits and even VLAN uses 32 bit values in its API.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 4 ++--
 net/8021q/vlanproc.c    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 81e9bc93569b..aeddb49193f9 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -99,7 +99,7 @@ static inline void vlan_group_set_device(struct vlan_group *vg, int vlan_id,
 }
 
 struct vlan_priority_tci_mapping {
-	unsigned long priority;
+	u32 priority;
 	unsigned short vlan_qos; /* This should be shifted when first set, so we only do it
 				  * at provisioning time.
 				  * ((skb->priority << 13) & 0xE000)
@@ -112,7 +112,7 @@ struct vlan_dev_info {
 	/** This will be the mapping that correlates skb->priority to
 	 * 3 bits of VLAN QOS tags...
 	 */
-	unsigned long ingress_priority_map[8];
+	u32 ingress_priority_map[8];
 	struct vlan_priority_tci_mapping *egress_priority_map[16]; /* hash table */
 
 	unsigned short vlan_id;        /*  The VLAN Identifier for this interface. */
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index d216a64421cd..8693b21b3caa 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -342,7 +342,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 	seq_printf(seq, "Device: %s", dev_info->real_dev->name);
 	/* now show all PRIORITY mappings relating to this VLAN */
 	seq_printf(seq,
-		       "\nINGRESS priority mappings: 0:%lu  1:%lu  2:%lu  3:%lu  4:%lu  5:%lu  6:%lu 7:%lu\n",
+		       "\nINGRESS priority mappings: 0:%u  1:%u  2:%u  3:%u  4:%u  5:%u  6:%u 7:%u\n",
 		       dev_info->ingress_priority_map[0],
 		       dev_info->ingress_priority_map[1],
 		       dev_info->ingress_priority_map[2],
@@ -357,7 +357,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 		const struct vlan_priority_tci_mapping *mp
 			= dev_info->egress_priority_map[i];
 		while (mp) {
-			seq_printf(seq, "%lu:%hu ",
+			seq_printf(seq, "%u:%hu ",
 				   mp->priority, ((mp->vlan_qos >> 13) & 0x7));
 			mp = mp->next;
 		}
-- 
cgit v1.3-8-gc7d7


From b020cb488586f982f40eb257a32e92a4de710d65 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 13 Jun 2007 12:07:22 -0700
Subject: [VLAN]: Keep track of number of QoS mappings

Keep track of the number of configured ingress/egress QoS mappings to
avoid iteration while calculating the netlink attribute size.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  3 +++
 net/8021q/vlan_dev.c    | 27 +++++++++++++++++++++------
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index aeddb49193f9..b46d4225f74e 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -112,7 +112,10 @@ struct vlan_dev_info {
 	/** This will be the mapping that correlates skb->priority to
 	 * 3 bits of VLAN QOS tags...
 	 */
+	unsigned int nr_ingress_mappings;
 	u32 ingress_priority_map[8];
+
+	unsigned int nr_egress_mappings;
 	struct vlan_priority_tci_mapping *egress_priority_map[16]; /* hash table */
 
 	unsigned short vlan_id;        /*  The VLAN Identifier for this interface. */
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 05a23601f670..4f6ede752a37 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -537,35 +537,50 @@ int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
 void vlan_dev_set_ingress_priority(const struct net_device *dev,
 				   u32 skb_prio, short vlan_prio)
 {
-	VLAN_DEV_INFO(dev)->ingress_priority_map[vlan_prio & 0x7] = skb_prio;
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
+
+	if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio)
+		vlan->nr_ingress_mappings--;
+	else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio)
+		vlan->nr_ingress_mappings++;
+
+	vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio;
 }
 
 int vlan_dev_set_egress_priority(const struct net_device *dev,
 				 u32 skb_prio, short vlan_prio)
 {
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
 	struct vlan_priority_tci_mapping *mp = NULL;
 	struct vlan_priority_tci_mapping *np;
+	u32 vlan_qos = (vlan_prio << 13) & 0xE000;
 
 	/* See if a priority mapping exists.. */
-	mp = VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF];
+	mp = vlan->egress_priority_map[skb_prio & 0xF];
 	while (mp) {
 		if (mp->priority == skb_prio) {
-			mp->vlan_qos = ((vlan_prio << 13) & 0xE000);
+			if (mp->vlan_qos && !vlan_qos)
+				vlan->nr_egress_mappings--;
+			else if (!mp->vlan_qos && vlan_qos)
+				vlan->nr_egress_mappings++;
+			mp->vlan_qos = vlan_qos;
 			return 0;
 		}
 		mp = mp->next;
 	}
 
 	/* Create a new mapping then. */
-	mp = VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF];
+	mp = vlan->egress_priority_map[skb_prio & 0xF];
 	np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL);
 	if (!np)
 		return -ENOBUFS;
 
 	np->next = mp;
 	np->priority = skb_prio;
-	np->vlan_qos = ((vlan_prio << 13) & 0xE000);
-	VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF] = np;
+	np->vlan_qos = vlan_qos;
+	vlan->egress_priority_map[skb_prio & 0xF] = np;
+	if (vlan_qos)
+		vlan->nr_egress_mappings++;
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From a4bf3af4ac46802436d352ef409cee4fe80445b3 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 13 Jun 2007 12:07:37 -0700
Subject: [VLAN]: Introduce symbolic constants for flag values

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  4 ++++
 net/8021q/vlan.c        |  2 +-
 net/8021q/vlan_dev.c    | 13 +++++++------
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index b46d4225f74e..c7912876a210 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -398,6 +398,10 @@ enum vlan_ioctl_cmds {
 	GET_VLAN_VID_CMD /* Get the VID of this VLAN (specified by name) */
 };
 
+enum vlan_flags {
+	VLAN_FLAG_REORDER_HDR	= 0x1,
+};
+
 enum vlan_name_types {
 	VLAN_NAME_TYPE_PLUS_VID, /* Name will look like:  vlan0005 */
 	VLAN_NAME_TYPE_RAW_PLUS_VID, /* name will look like:  eth1.0005 */
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5801993182b4..f12f914edf02 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -565,7 +565,7 @@ static int register_vlan_device(struct net_device *real_dev,
 	VLAN_DEV_INFO(new_dev)->vlan_id = VLAN_ID; /* 1 through VLAN_VID_MASK */
 	VLAN_DEV_INFO(new_dev)->real_dev = real_dev;
 	VLAN_DEV_INFO(new_dev)->dent = NULL;
-	VLAN_DEV_INFO(new_dev)->flags = 1;
+	VLAN_DEV_INFO(new_dev)->flags = VLAN_FLAG_REORDER_HDR;
 
 	err = register_vlan_dev(new_dev);
 	if (err < 0)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4f6ede752a37..95afe387b952 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -73,7 +73,7 @@ int vlan_dev_rebuild_header(struct sk_buff *skb)
 
 static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb)
 {
-	if (VLAN_DEV_INFO(skb->dev)->flags & 1) {
+	if (VLAN_DEV_INFO(skb->dev)->flags & VLAN_FLAG_REORDER_HDR) {
 		if (skb_shared(skb) || skb_cloned(skb)) {
 			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 			kfree_skb(skb);
@@ -350,7 +350,8 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 	 * header shuffling in the hard_start_xmit.  Users can turn off this
 	 * REORDER behaviour with the vconfig tool.
 	 */
-	build_vlan_header = ((VLAN_DEV_INFO(dev)->flags & 1) == 0);
+	if (!(VLAN_DEV_INFO(dev)->flags & VLAN_FLAG_REORDER_HDR))
+		build_vlan_header = 1;
 
 	if (build_vlan_header) {
 		vhdr = (struct vlan_hdr *) skb_push(skb, VLAN_HLEN);
@@ -584,16 +585,16 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
 	return 0;
 }
 
-/* Flags are defined in the vlan_dev_info class in include/linux/if_vlan.h file. */
+/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */
 int vlan_dev_set_vlan_flag(const struct net_device *dev,
 			   u32 flag, short flag_val)
 {
 	/* verify flag is supported */
-	if (flag == 1) {
+	if (flag == VLAN_FLAG_REORDER_HDR) {
 		if (flag_val) {
-			VLAN_DEV_INFO(dev)->flags |= 1;
+			VLAN_DEV_INFO(dev)->flags |= VLAN_FLAG_REORDER_HDR;
 		} else {
-			VLAN_DEV_INFO(dev)->flags &= ~1;
+			VLAN_DEV_INFO(dev)->flags &= ~VLAN_FLAG_REORDER_HDR;
 		}
 		return 0;
 	}
-- 
cgit v1.3-8-gc7d7


From 07b5b17e157b7018d0ca40ca0d1581a23096fb45 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 13 Jun 2007 12:07:54 -0700
Subject: [VLAN]: Use rtnl_link API

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_link.h  |  34 +++++++
 net/8021q/Makefile       |   2 +-
 net/8021q/vlan.c         |  29 ++++--
 net/8021q/vlan.h         |  10 ++
 net/8021q/vlan_netlink.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 300 insertions(+), 11 deletions(-)
 create mode 100644 net/8021q/vlan_netlink.c

(limited to 'include/linux')

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 3144babd2357..422084d18ce1 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -153,4 +153,38 @@ enum
 
 #define IFLA_INFO_MAX	(__IFLA_INFO_MAX - 1)
 
+/* VLAN section */
+
+enum
+{
+	IFLA_VLAN_UNSPEC,
+	IFLA_VLAN_ID,
+	IFLA_VLAN_FLAGS,
+	IFLA_VLAN_EGRESS_QOS,
+	IFLA_VLAN_INGRESS_QOS,
+	__IFLA_VLAN_MAX,
+};
+
+#define IFLA_VLAN_MAX	(__IFLA_VLAN_MAX - 1)
+
+struct ifla_vlan_flags {
+	__u32	flags;
+	__u32	mask;
+};
+
+enum
+{
+	IFLA_VLAN_QOS_UNSPEC,
+	IFLA_VLAN_QOS_MAPPING,
+	__IFLA_VLAN_QOS_MAX
+};
+
+#define IFLA_VLAN_QOS_MAX	(__IFLA_VLAN_QOS_MAX - 1)
+
+struct ifla_vlan_qos_mapping
+{
+	__u32 from;
+	__u32 to;
+};
+
 #endif /* _LINUX_IF_LINK_H */
diff --git a/net/8021q/Makefile b/net/8021q/Makefile
index 97feb44dbdce..10ca7f486c3a 100644
--- a/net/8021q/Makefile
+++ b/net/8021q/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_VLAN_8021Q) += 8021q.o
 
-8021q-objs := vlan.o vlan_dev.o
+8021q-objs := vlan.o vlan_dev.o vlan_netlink.o
 
 ifeq ($(CONFIG_PROC_FS),y)
 8021q-objs += vlanproc.o
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index f12f914edf02..e7583eea6fda 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -97,15 +97,22 @@ static int __init vlan_proto_init(void)
 
 	/* Register us to receive netdevice events */
 	err = register_netdevice_notifier(&vlan_notifier_block);
-	if (err < 0) {
-		dev_remove_pack(&vlan_packet_type);
-		vlan_proc_cleanup();
-		return err;
-	}
+	if (err < 0)
+		goto err1;
 
-	vlan_ioctl_set(vlan_ioctl_handler);
+	err = vlan_netlink_init();
+	if (err < 0)
+		goto err2;
 
+	vlan_ioctl_set(vlan_ioctl_handler);
 	return 0;
+
+err2:
+	unregister_netdevice_notifier(&vlan_notifier_block);
+err1:
+	vlan_proc_cleanup();
+	dev_remove_pack(&vlan_packet_type);
+	return err;
 }
 
 /* Cleanup all vlan devices
@@ -136,6 +143,7 @@ static void __exit vlan_cleanup_module(void)
 {
 	int i;
 
+	vlan_netlink_fini();
 	vlan_ioctl_set(NULL);
 
 	/* Un-register us from receiving netdevice events */
@@ -306,7 +314,7 @@ static int unregister_vlan_dev(struct net_device *real_dev,
 	return ret;
 }
 
-static int unregister_vlan_device(struct net_device *dev)
+int unregister_vlan_device(struct net_device *dev)
 {
 	int ret;
 
@@ -361,7 +369,7 @@ static int vlan_dev_init(struct net_device *dev)
 	return 0;
 }
 
-static void vlan_setup(struct net_device *new_dev)
+void vlan_setup(struct net_device *new_dev)
 {
 	SET_MODULE_OWNER(new_dev);
 
@@ -410,7 +418,7 @@ static void vlan_transfer_operstate(const struct net_device *dev, struct net_dev
 	}
 }
 
-static int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_id)
+int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_id)
 {
 	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
 		printk(VLAN_DBG "%s: VLANs not supported on %s.\n",
@@ -447,7 +455,7 @@ static int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_
 	return 0;
 }
 
-static int register_vlan_dev(struct net_device *dev)
+int register_vlan_dev(struct net_device *dev)
 {
 	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
 	struct net_device *real_dev = vlan->real_dev;
@@ -567,6 +575,7 @@ static int register_vlan_device(struct net_device *real_dev,
 	VLAN_DEV_INFO(new_dev)->dent = NULL;
 	VLAN_DEV_INFO(new_dev)->flags = VLAN_FLAG_REORDER_HDR;
 
+	new_dev->rtnl_link_ops = &vlan_link_ops;
 	err = register_vlan_dev(new_dev);
 	if (err < 0)
 		goto out_free_newdev;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index b83739017e9d..fe6bb0f7d275 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -72,4 +72,14 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
 void vlan_dev_get_vid(const struct net_device *dev, unsigned short *result);
 void vlan_dev_set_multicast_list(struct net_device *vlan_dev);
 
+int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_id);
+void vlan_setup(struct net_device *dev);
+int register_vlan_dev(struct net_device *dev);
+int unregister_vlan_device(struct net_device *dev);
+
+int vlan_netlink_init(void);
+void vlan_netlink_fini(void);
+
+extern struct rtnl_link_ops vlan_link_ops;
+
 #endif /* !(__BEN_VLAN_802_1Q_INC__) */
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
new file mode 100644
index 000000000000..844c7e43d0fa
--- /dev/null
+++ b/net/8021q/vlan_netlink.c
@@ -0,0 +1,236 @@
+/*
+ *	VLAN netlink control interface
+ *
+ * 	Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include "vlan.h"
+
+
+static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = {
+	[IFLA_VLAN_ID]		= { .type = NLA_U16 },
+	[IFLA_VLAN_FLAGS]	= { .len = sizeof(struct ifla_vlan_flags) },
+	[IFLA_VLAN_EGRESS_QOS]	= { .type = NLA_NESTED },
+	[IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = {
+	[IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) },
+};
+
+
+static inline int vlan_validate_qos_map(struct nlattr *attr)
+{
+	if (!attr)
+		return 0;
+	return nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy);
+}
+
+static int vlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ifla_vlan_flags *flags;
+	u16 id;
+	int err;
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VLAN_ID]) {
+		id = nla_get_u16(data[IFLA_VLAN_ID]);
+		if (id >= VLAN_VID_MASK)
+			return -ERANGE;
+	}
+	if (data[IFLA_VLAN_FLAGS]) {
+		flags = nla_data(data[IFLA_VLAN_FLAGS]);
+		if ((flags->flags & flags->mask) & ~VLAN_FLAG_REORDER_HDR)
+			return -EINVAL;
+	}
+
+	err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
+	if (err < 0)
+		return err;
+	err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
+	if (err < 0)
+		return err;
+	return 0;
+}
+
+static int vlan_changelink(struct net_device *dev,
+			   struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
+	struct ifla_vlan_flags *flags;
+	struct ifla_vlan_qos_mapping *m;
+	struct nlattr *attr;
+	int rem;
+
+	if (data[IFLA_VLAN_FLAGS]) {
+		flags = nla_data(data[IFLA_VLAN_FLAGS]);
+		vlan->flags = (vlan->flags & ~flags->mask) |
+			      (flags->flags & flags->mask);
+	}
+	if (data[IFLA_VLAN_INGRESS_QOS]) {
+		nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
+			m = nla_data(attr);
+			vlan_dev_set_ingress_priority(dev, m->to, m->from);
+		}
+	}
+	if (data[IFLA_VLAN_EGRESS_QOS]) {
+		nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
+			m = nla_data(attr);
+			vlan_dev_set_egress_priority(dev, m->from, m->to);
+		}
+	}
+	return 0;
+}
+
+static int vlan_newlink(struct net_device *dev,
+			struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
+	struct net_device *real_dev;
+	int err;
+
+	if (!data[IFLA_VLAN_ID])
+		return -EINVAL;
+
+	if (!tb[IFLA_LINK])
+		return -EINVAL;
+	real_dev = __dev_get_by_index(nla_get_u32(tb[IFLA_LINK]));
+	if (!real_dev)
+		return -ENODEV;
+
+	vlan->vlan_id  = nla_get_u16(data[IFLA_VLAN_ID]);
+	vlan->real_dev = real_dev;
+	vlan->flags    = VLAN_FLAG_REORDER_HDR;
+
+	err = vlan_check_real_dev(real_dev, vlan->vlan_id);
+	if (err < 0)
+		return err;
+
+	if (!tb[IFLA_MTU])
+		dev->mtu = real_dev->mtu;
+	else if (dev->mtu > real_dev->mtu)
+		return -EINVAL;
+
+	err = vlan_changelink(dev, tb, data);
+	if (err < 0)
+		return err;
+
+	return register_vlan_dev(dev);
+}
+
+static void vlan_dellink(struct net_device *dev)
+{
+	unregister_vlan_device(dev);
+}
+
+static inline size_t vlan_qos_map_size(unsigned int n)
+{
+	if (n == 0)
+		return 0;
+	/* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */
+	return nla_total_size(sizeof(struct nlattr)) +
+	       nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n;
+}
+
+static size_t vlan_get_size(const struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
+
+	return nla_total_size(2) +	/* IFLA_VLAN_ID */
+	       vlan_qos_map_size(vlan->nr_ingress_mappings) +
+	       vlan_qos_map_size(vlan->nr_egress_mappings);
+}
+
+static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
+	struct vlan_priority_tci_mapping *pm;
+	struct ifla_vlan_flags f;
+	struct ifla_vlan_qos_mapping m;
+	struct nlattr *nest;
+	unsigned int i;
+
+	NLA_PUT_U16(skb, IFLA_VLAN_ID, VLAN_DEV_INFO(dev)->vlan_id);
+	if (vlan->flags) {
+		f.flags = vlan->flags;
+		f.mask  = ~0;
+		NLA_PUT(skb, IFLA_VLAN_FLAGS, sizeof(f), &f);
+	}
+	if (vlan->nr_ingress_mappings) {
+		nest = nla_nest_start(skb, IFLA_VLAN_INGRESS_QOS);
+		if (nest == NULL)
+			goto nla_put_failure;
+
+		for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) {
+			if (!vlan->ingress_priority_map[i])
+				continue;
+
+			m.from = i;
+			m.to   = vlan->ingress_priority_map[i];
+			NLA_PUT(skb, IFLA_VLAN_QOS_MAPPING,
+				sizeof(m), &m);
+		}
+		nla_nest_end(skb, nest);
+	}
+
+	if (vlan->nr_egress_mappings) {
+		nest = nla_nest_start(skb, IFLA_VLAN_EGRESS_QOS);
+		if (nest == NULL)
+			goto nla_put_failure;
+
+		for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+			for (pm = vlan->egress_priority_map[i]; pm;
+			     pm = pm->next) {
+				if (!pm->vlan_qos)
+					continue;
+
+				m.from = pm->priority;
+				m.to   = (pm->vlan_qos >> 13) & 0x7;
+				NLA_PUT(skb, IFLA_VLAN_QOS_MAPPING,
+					sizeof(m), &m);
+			}
+		}
+		nla_nest_end(skb, nest);
+	}
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+struct rtnl_link_ops vlan_link_ops __read_mostly = {
+	.kind		= "vlan",
+	.maxtype	= IFLA_VLAN_MAX,
+	.policy		= vlan_policy,
+	.priv_size	= sizeof(struct vlan_dev_info),
+	.setup		= vlan_setup,
+	.validate	= vlan_validate,
+	.newlink	= vlan_newlink,
+	.changelink	= vlan_changelink,
+	.dellink	= vlan_dellink,
+	.get_size	= vlan_get_size,
+	.fill_info	= vlan_fill_info,
+};
+
+int __init vlan_netlink_init(void)
+{
+	return rtnl_link_register(&vlan_link_ops);
+}
+
+void __exit vlan_netlink_fini(void)
+{
+	rtnl_link_unregister(&vlan_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("vlan");
-- 
cgit v1.3-8-gc7d7


From f1c91da44728fba24927e44056a56e507c11cf7b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 16 Jun 2007 12:38:51 -0300
Subject: [KTIME]: Introduce ktime_us_delta

This provides a reusable time difference function which returns the difference in
microseconds, as often used in the DCCP code.

Commiter note: renamed ktime_delta to ktime_us_delta and put it in ktime.h.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
---
 include/linux/ktime.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 2b139f66027f..923665958f90 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -279,6 +279,11 @@ static inline s64 ktime_to_us(const ktime_t kt)
 	return (s64) tv.tv_sec * USEC_PER_SEC + tv.tv_usec;
 }
 
+static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier)
+{
+       return ktime_to_us(ktime_sub(later, earlier));
+}
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
-- 
cgit v1.3-8-gc7d7


From 1e180f726a58089d15637b5495fecbad8c50c833 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Sat, 16 Jun 2007 12:39:38 -0300
Subject: [KTIME]: Introduce ktime_add_us

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
---
 include/linux/ktime.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 923665958f90..dae7143644fe 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -284,6 +284,11 @@ static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier)
        return ktime_to_us(ktime_sub(later, earlier));
 }
 
+static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec)
+{
+	return ktime_add_ns(kt, usec * 1000);
+}
+
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
-- 
cgit v1.3-8-gc7d7


From 334a8132d9950f769f390f0f35c233d099688e7a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 25 Jun 2007 04:35:20 -0700
Subject: [SKBUFF]: Keep track of writable header len of headerless clones

Currently NAT (and others) that want to modify cloned skbs copy them,
even if in the vast majority of cases its not necessary because the
skb is a clone made by TCP and the portion NAT wants to modify is
actually writable because TCP release the header reference before
cloning.

The problem is that there is no clean way for NAT to find out how
long the writable header area is, so this patch introduces skb->hdr_len
to hold this length. When a headerless skb is cloned skb->hdr_len
is set to the current headroom, for regular clones it is copied from
the original. A new function skb_clone_writable(skb, len) returns
whether the skb is writable up to len bytes from skb->data. To avoid
enlarging the skb the mac_len field is reduced to 16 bit and the
new hdr_len field is put in the remaining 16 bit.

I've done a few rough benchmarks of NAT (not with this exact patch,
but a very similar one). As expected it saves huge amounts of system
time in case of sendfile, bringing it down to basically the same
amount as without NAT, with sendmsg it only helps on loopback,
probably because of the large MTU.

Transmit a 1GB file using sendfile/sendmsg over eth0/lo with and
without NAT:

- sendfile eth0, no NAT:	sys     0m0.388s
- sendfile eth0, NAT:		sys     0m1.835s
- sendfile eth0: NAT + path:	sys     0m0.370s	(~ -80%)

- sendfile lo, no NAT:		sys     0m0.258s
- sendfile lo, NAT:		sys     0m2.609s
- sendfile lo, NAT + patch:	sys     0m0.260s	(~ -90%)

- sendmsg eth0, no NAT:		sys     0m2.508s
- sendmsg eth0, NAT:		sys     0m2.539s
- sendmsg eth0, NAT + patch:	sys     0m2.445s	(no change)

- sendmsg lo, no NAT:		sys	0m2.151s
- sendmsg lo, NAT:		sys     0m3.557s
- sendmsg lo, NAT + patch:	sys     0m2.159s	(~ -40%)

I expect other users can see a similar performance improvement,
packet mangling iptables targets, ipip and ip_gre come to mind ..

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 24 ++++++++++++++++++++----
 net/core/skbuff.c      |  2 ++
 net/netfilter/core.c   |  4 +++-
 3 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6f0b2f7d0010..881fe80f01d0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -147,8 +147,8 @@ struct skb_shared_info {
 
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
- * the entire skb->data.  It is up to the users of the skb to agree on
- * where the payload starts.
+ * the entire skb->data.  A clone of a headerless skb holds the length of
+ * the header in skb->hdr_len.
  *
  * All users must obey the rule that the skb->data reference count must be
  * greater than or equal to the payload reference count.
@@ -206,6 +206,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@len: Length of actual data
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
+ *	@hdr_len: writable header length of cloned skb
  *	@csum: Checksum (must include start/offset pair)
  *	@csum_start: Offset from skb->head where checksumming should start
  *	@csum_offset: Offset from csum_start where checksum should be stored
@@ -260,8 +261,9 @@ struct sk_buff {
 	char			cb[48];
 
 	unsigned int		len,
-				data_len,
-				mac_len;
+				data_len;
+	__u16			mac_len,
+				hdr_len;
 	union {
 		__wsum		csum;
 		struct {
@@ -1321,6 +1323,20 @@ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
 	return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
 }
 
+/**
+ *	skb_clone_writable - is the header of a clone writable
+ *	@skb: buffer to check
+ *	@len: length up to which to write
+ *
+ *	Returns true if modifying the header part of the cloned buffer
+ *	does not requires the data to be copied.
+ */
+static inline int skb_clone_writable(struct sk_buff *skb, int len)
+{
+	return !skb_header_cloned(skb) &&
+	       skb_headroom(skb) + len <= skb->hdr_len;
+}
+
 /**
  *	skb_cow - copy header of skb when it is required
  *	@skb: buffer to cow
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3943c3ad9145..c989c3a0f907 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -415,6 +415,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	C(csum);
 	C(local_df);
 	n->cloned = 1;
+	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 	n->nohdr = 0;
 	C(pkt_type);
 	C(ip_summed);
@@ -676,6 +677,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->network_header   += off;
 	skb->mac_header	      += off;
 	skb->cloned   = 0;
+	skb->hdr_len  = 0;
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 	return 0;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a84478ee2ded..3aaabec70d19 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -203,7 +203,9 @@ int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
 		return 0;
 
 	/* Not exclusive use of packet?  Must copy. */
-	if (skb_shared(*pskb) || skb_cloned(*pskb))
+	if (skb_cloned(*pskb) && !skb_clone_writable(*pskb, writable_len))
+		goto copy_skb;
+	if (skb_shared(*pskb))
 		goto copy_skb;
 
 	return pskb_may_pull(*pskb, writable_len);
-- 
cgit v1.3-8-gc7d7


From afdc3238ec948531205f5c5f77d2de7bae519c71 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 25 Jun 2007 14:30:16 -0700
Subject: [RTNETLINK]: Add nested compat attribute

Add a nested compat attribute type that can be used to convert
attributes that contain a structure to nested attributes in a
backwards compatible way.

The attribute looks like this:

struct {
        [ compat contents ]
        struct rtattr {
                .rta_len        = total size,
                .rta_type       = type,
        } rta;
        struct old_structure struct;

        [ nested top-level attribute ]
        struct rtattr {
                .rta_len        = nest size,
                .rta_type       = type,
        } nest_attr;

        [ optional 0 .. n nested attributes ]
        struct rtattr {
                .rta_len        = private attribute len,
                .rta_type       = private attribute typ,
        } nested_attr;
        struct nested_data data;
};

Since both userspace and kernel deal correctly with attributes that are
larger than expected old versions will just parse the compat part and
ignore the rest.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 14 ++++++++++++++
 net/core/rtnetlink.c      | 16 ++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 612785848532..6731e7f4cc0f 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -570,6 +570,8 @@ static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str)
 }
 
 extern int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len);
+extern int rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
+				      struct rtattr *rta, void **data, int len);
 
 #define rtattr_parse_nested(tb, max, rta) \
 	rtattr_parse((tb), (max), RTA_DATA((rta)), RTA_PAYLOAD((rta)))
@@ -638,6 +640,18 @@ extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const voi
 ({	(start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \
 	(skb)->len; })
 
+#define RTA_NEST_COMPAT(skb, type, attrlen, data) \
+({	struct rtattr *__start = (struct rtattr *)skb_tail_pointer(skb); \
+	RTA_PUT(skb, type, attrlen, data); \
+	RTA_NEST(skb, type); \
+	__start; })
+
+#define RTA_NEST_COMPAT_END(skb, start) \
+({	struct rtattr *__nest = (void *)(start) + NLMSG_ALIGN((start)->rta_len); \
+	(start)->rta_len = skb_tail_pointer(skb) - (unsigned char *)(start); \
+	RTA_NEST_END(skb, __nest); \
+	(skb)->len; })
+
 #define RTA_NEST_CANCEL(skb, start) \
 ({	if (start) \
 		skb_trim(skb, (unsigned char *) (start) - (skb)->data); \
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 06c0c5afabf0..c25d23ba6d5d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -97,6 +97,21 @@ int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
 	return 0;
 }
 
+int rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
+			       struct rtattr *rta, void **data, int len)
+{
+	if (RTA_PAYLOAD(rta) < len)
+		return -1;
+	*data = RTA_DATA(rta);
+
+	if (RTA_PAYLOAD(rta) >= RTA_ALIGN(len) + sizeof(struct rtattr)) {
+		rta = RTA_DATA(rta) + RTA_ALIGN(len);
+		return rtattr_parse_nested(tb, maxattr, rta);
+	}
+	memset(tb, 0, sizeof(struct rtattr *) * maxattr);
+	return 0;
+}
+
 static struct rtnl_link *rtnl_msg_handlers[NPROTO];
 
 static inline int rtm_msgindex(int msgtype)
@@ -1297,6 +1312,7 @@ void __init rtnetlink_init(void)
 EXPORT_SYMBOL(__rta_fill);
 EXPORT_SYMBOL(rtattr_strlcpy);
 EXPORT_SYMBOL(rtattr_parse);
+EXPORT_SYMBOL(rtattr_parse_nested_compat);
 EXPORT_SYMBOL(rtnetlink_put_metrics);
 EXPORT_SYMBOL(rtnl_lock);
 EXPORT_SYMBOL(rtnl_trylock);
-- 
cgit v1.3-8-gc7d7


From 2371baa4bdab3268b32009926f75e7a5d3a41506 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 26 Jun 2007 03:23:44 -0700
Subject: [RTNETLINK]: Fix rtnetlink compat attribute patch

Sent the wrong patch previously.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 8 ++++++--
 net/core/rtnetlink.c      | 8 +++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6731e7f4cc0f..c91476ce314a 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -570,12 +570,16 @@ static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str)
 }
 
 extern int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len);
-extern int rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
-				      struct rtattr *rta, void **data, int len);
+extern int __rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
+				        struct rtattr *rta, int len);
 
 #define rtattr_parse_nested(tb, max, rta) \
 	rtattr_parse((tb), (max), RTA_DATA((rta)), RTA_PAYLOAD((rta)))
 
+#define rtattr_parse_nested_compat(tb, max, rta, data, len) \
+({	data = RTA_PAYLOAD(rta) >= len ? RTA_DATA(rta) : NULL; \
+	__rtattr_parse_nested_compat(tb, max, rta, len); })
+
 extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo);
 extern int rtnl_unicast(struct sk_buff *skb, u32 pid);
 extern int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c25d23ba6d5d..54c17e4cd28f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -97,13 +97,11 @@ int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
 	return 0;
 }
 
-int rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
-			       struct rtattr *rta, void **data, int len)
+int __rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
+			         struct rtattr *rta, int len)
 {
 	if (RTA_PAYLOAD(rta) < len)
 		return -1;
-	*data = RTA_DATA(rta);
-
 	if (RTA_PAYLOAD(rta) >= RTA_ALIGN(len) + sizeof(struct rtattr)) {
 		rta = RTA_DATA(rta) + RTA_ALIGN(len);
 		return rtattr_parse_nested(tb, maxattr, rta);
@@ -1312,7 +1310,7 @@ void __init rtnetlink_init(void)
 EXPORT_SYMBOL(__rta_fill);
 EXPORT_SYMBOL(rtattr_strlcpy);
 EXPORT_SYMBOL(rtattr_parse);
-EXPORT_SYMBOL(rtattr_parse_nested_compat);
+EXPORT_SYMBOL(__rtattr_parse_nested_compat);
 EXPORT_SYMBOL(rtnetlink_put_metrics);
 EXPORT_SYMBOL(rtnl_lock);
 EXPORT_SYMBOL(rtnl_trylock);
-- 
cgit v1.3-8-gc7d7


From 59fbb3a61e02deaeaa4fb50792217921f3002d64 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Tue, 26 Jun 2007 23:56:32 -0700
Subject: [IPV6] MIP6: Loadable module support for MIPv6.

This patch makes MIPv6 loadable module named "mip6".

Here is a modprobe.conf(5) example to load it automatically
when user application uses XFRM state for MIPv6:

alias xfrm-type-10-43 mip6
alias xfrm-type-10-60 mip6

Some MIPv6 feature is not included by this modular, however,
it should not be affected to other features like either IPsec
or IPv6 with and without the patch.
We may discuss XFRM, MH (RAW socket) and ancillary data/sockopt
separately for future work.

Loadable features:
* MH receiving check (to send ICMP error back)
* RO header parsing and building (i.e. RH2 and HAO in DSTOPTS)
* XFRM policy/state database handling for RO

These are NOT covered as loadable:
* Home Address flags and its rule on source address selection
* XFRM sub policy (depends on its own kernel option)
* XFRM functions to receive RO as IPv6 extension header
* MH sending/receiving through raw socket if user application
  opens it (since raw socket allows to do so)
* RH2 sending as ancillary data
* RH2 operation with setsockopt(2)

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  2 +-
 include/net/addrconf.h   |  2 +-
 include/net/mip6.h       |  4 ----
 include/net/rawv6.h      |  9 +++++++++
 net/ipv6/Kconfig         |  2 +-
 net/ipv6/Makefile        |  2 +-
 net/ipv6/addrconf.c      |  4 ++--
 net/ipv6/af_inet6.c      | 10 +---------
 net/ipv6/ah6.c           |  8 ++++----
 net/ipv6/datagram.c      |  2 +-
 net/ipv6/exthdrs.c       | 19 ++++++++++---------
 net/ipv6/icmp.c          |  2 +-
 net/ipv6/ip6_output.c    |  2 +-
 net/ipv6/ipv6_sockglue.c |  2 +-
 net/ipv6/mip6.c          | 22 +++++++++++++++++++---
 net/ipv6/raw.c           | 34 +++++++++++++++++++++++++++++++---
 net/ipv6/xfrm6_policy.c  |  4 ++--
 net/ipv6/xfrm6_state.c   |  4 ++--
 18 files changed, 88 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 648bd1f0912d..213b63be3c8f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -247,7 +247,7 @@ struct inet6_skb_parm {
 	__u16			lastopt;
 	__u32			nhoff;
 	__u16			flags;
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	__u16			dsthao;
 #endif
 
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f3531d0bcd05..33b593e17441 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -61,7 +61,7 @@ extern int			addrconf_set_dstaddr(void __user *arg);
 extern int			ipv6_chk_addr(struct in6_addr *addr,
 					      struct net_device *dev,
 					      int strict);
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 extern int			ipv6_chk_home_addr(struct in6_addr *addr);
 #endif
 extern struct inet6_ifaddr *	ipv6_get_ifaddr(struct in6_addr *addr,
diff --git a/include/net/mip6.h b/include/net/mip6.h
index 68263c6d9996..63272610a24a 100644
--- a/include/net/mip6.h
+++ b/include/net/mip6.h
@@ -54,8 +54,4 @@ struct ip6_mh {
 #define IP6_MH_TYPE_BERROR	7   /* Binding Error */
 #define IP6_MH_TYPE_MAX		IP6_MH_TYPE_BERROR
 
-extern int mip6_init(void);
-extern void mip6_fini(void);
-extern int mip6_mh_filter(struct sock *sk, struct sk_buff *skb);
-
 #endif
diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index af8960878ef4..a5819891d525 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 
+#include <net/protocol.h>
+
 #define RAWV6_HTABLE_SIZE	MAX_INET_PROTOS
 extern struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE];
 extern rwlock_t raw_v6_lock;
@@ -23,6 +25,13 @@ extern void			rawv6_err(struct sock *sk,
 					  int type, int code, 
 					  int offset, __be32 info);
 
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+int rawv6_mh_filter_register(int (*filter)(struct sock *sock,
+					   struct sk_buff *skb));
+int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock,
+					     struct sk_buff *skb));
+#endif
+
 #endif
 
 #endif
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 8e5d54f23b49..eb0b8085949b 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -109,7 +109,7 @@ config INET6_IPCOMP
 	  If unsure, say Y.
 
 config IPV6_MIP6
-	bool "IPv6: Mobility (EXPERIMENTAL)"
+	tristate "IPv6: Mobility (EXPERIMENTAL)"
 	depends on IPV6 && EXPERIMENTAL
 	select XFRM
 	---help---
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index bb33309044c9..87c23a73d284 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -14,7 +14,6 @@ ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
 	xfrm6_output.o
 ipv6-$(CONFIG_NETFILTER) += netfilter.o
 ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
-ipv6-$(CONFIG_IPV6_MIP6) += mip6.o
 ipv6-$(CONFIG_PROC_FS) += proc.o
 
 ipv6-objs += $(ipv6-y)
@@ -28,6 +27,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
 obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
 obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
 obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
+obj-$(CONFIG_IPV6_MIP6) += mip6.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 
 obj-$(CONFIG_IPV6_SIT) += sit.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 79b79f3de24c..11c0028863b0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1034,7 +1034,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
 			}
 
 			/* Rule 4: Prefer home address */
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 			if (hiscore.rule < 4) {
 				if (ifa_result->flags & IFA_F_HOMEADDRESS)
 					hiscore.attrs |= IPV6_SADDR_SCORE_HOA;
@@ -2835,7 +2835,7 @@ void if6_proc_exit(void)
 }
 #endif	/* CONFIG_PROC_FS */
 
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 /* Check if address is a home address configured on any interface. */
 int ipv6_chk_home_addr(struct in6_addr *addr)
 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6dd377253cf7..eed09373a45d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -58,9 +58,6 @@
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
 #endif
-#ifdef CONFIG_IPV6_MIP6
-#include <net/mip6.h>
-#endif
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -853,9 +850,6 @@ static int __init inet6_init(void)
 	ipv6_frag_init();
 	ipv6_nodata_init();
 	ipv6_destopt_init();
-#ifdef CONFIG_IPV6_MIP6
-	mip6_init();
-#endif
 
 	/* Init v6 transport protocols. */
 	udpv6_init();
@@ -921,9 +915,7 @@ static void __exit inet6_exit(void)
 
 	/* Cleanup code parts. */
 	ipv6_packet_cleanup();
-#ifdef CONFIG_IPV6_MIP6
-	mip6_fini();
-#endif
+
 	addrconf_cleanup();
 	ip6_flowlabel_cleanup();
 	ip6_route_cleanup();
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 01fa302d281e..cc6884a40be4 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -74,7 +74,7 @@ bad:
 	return 0;
 }
 
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 /**
  *	ipv6_rearrange_destopt - rearrange IPv6 destination options header
  *	@iph: IPv6 header
@@ -228,7 +228,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 	u8 nexthdr;
 	char tmp_base[8];
 	struct {
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 		struct in6_addr saddr;
 #endif
 		struct in6_addr daddr;
@@ -255,7 +255,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 			err = -ENOMEM;
 			goto error;
 		}
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 		memcpy(tmp_ext, &top_iph->saddr, extlen);
 #else
 		memcpy(tmp_ext, &top_iph->daddr, extlen);
@@ -294,7 +294,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 
 	memcpy(top_iph, tmp_base, sizeof(tmp_base));
 	if (tmp_ext) {
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 		memcpy(&top_iph->saddr, tmp_ext, extlen);
 #else
 		memcpy(&top_iph->daddr, tmp_ext, extlen);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index b1fe7ac5dc90..ba1386dd4168 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -658,7 +658,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 
 			switch (rthdr->type) {
 			case IPV6_SRCRT_TYPE_0:
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 			case IPV6_SRCRT_TYPE_2:
 #endif
 				break;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 14be0b9b77a5..173a4bb52255 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -42,7 +42,7 @@
 #include <net/ndisc.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 #include <net/xfrm.h>
 #endif
 
@@ -90,6 +90,7 @@ int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
  bad:
 	return -1;
 }
+EXPORT_SYMBOL_GPL(ipv6_find_tlv);
 
 /*
  *	Parsing tlv encoded headers.
@@ -196,7 +197,7 @@ bad:
   Destination options header.
  *****************************/
 
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 static int ipv6_dest_hao(struct sk_buff **skbp, int optoff)
 {
 	struct sk_buff *skb = *skbp;
@@ -270,7 +271,7 @@ static int ipv6_dest_hao(struct sk_buff **skbp, int optoff)
 #endif
 
 static struct tlvtype_proc tlvprocdestopt_lst[] = {
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	{
 		.type	= IPV6_TLV_HAO,
 		.func	= ipv6_dest_hao,
@@ -283,7 +284,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	__u16 dstbuf;
 #endif
 	struct dst_entry *dst;
@@ -298,7 +299,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp)
 	}
 
 	opt->lastopt = opt->dst1 = skb_network_header_len(skb);
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	dstbuf = opt->dst1;
 #endif
 
@@ -308,7 +309,7 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp)
 		skb = *skbp;
 		skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;
 		opt = IP6CB(skb);
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 		opt->nhoff = dstbuf;
 #else
 		opt->nhoff = opt->dst1;
@@ -427,7 +428,7 @@ static int ipv6_rthdr_rcv(struct sk_buff **skbp)
 looped_back:
 	if (hdr->segments_left == 0) {
 		switch (hdr->type) {
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 		case IPV6_SRCRT_TYPE_2:
 			/* Silently discard type 2 header unless it was
 			 * processed by own
@@ -463,7 +464,7 @@ looped_back:
 			return -1;
 		}
 		break;
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	case IPV6_SRCRT_TYPE_2:
 		/* Silently discard invalid RTH type 2 */
 		if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
@@ -520,7 +521,7 @@ looped_back:
 	addr += i - 1;
 
 	switch (hdr->type) {
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	case IPV6_SRCRT_TYPE_2:
 		if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
 				     (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index e9bcce9e7bdf..4765a29f98a8 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -272,7 +272,7 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
 	return 0;
 }
 
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 static void mip6_addr_swap(struct sk_buff *skb)
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4704b5fc3085..31dafaf0b652 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -543,7 +543,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 			found_rhdr = 1;
 			break;
 		case NEXTHDR_DEST:
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 				break;
 #endif
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index aa3d07c52a8f..b636c38411fb 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -417,7 +417,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			struct ipv6_rt_hdr *rthdr = opt->srcrt;
 			switch (rthdr->type) {
 			case IPV6_SRCRT_TYPE_0:
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 			case IPV6_SRCRT_TYPE_2:
 #endif
 				break;
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 13b7160fb892..20c78ecff6a1 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -30,6 +30,7 @@
 #include <net/sock.h>
 #include <net/ipv6.h>
 #include <net/ip6_checksum.h>
+#include <net/rawv6.h>
 #include <net/xfrm.h>
 #include <net/mip6.h>
 
@@ -86,7 +87,7 @@ static int mip6_mh_len(int type)
 	return len;
 }
 
-int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
+static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
 {
 	struct ip6_mh *mh;
 
@@ -471,7 +472,7 @@ static struct xfrm_type mip6_rthdr_type =
 	.remote_addr	= mip6_xfrm_addr,
 };
 
-int __init mip6_init(void)
+static int __init mip6_init(void)
 {
 	printk(KERN_INFO "Mobile IPv6\n");
 
@@ -483,18 +484,33 @@ int __init mip6_init(void)
 		printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __FUNCTION__);
 		goto mip6_rthdr_xfrm_fail;
 	}
+	if (rawv6_mh_filter_register(mip6_mh_filter) < 0) {
+		printk(KERN_INFO "%s: can't add rawv6 mh filter\n", __FUNCTION__);
+		goto mip6_rawv6_mh_fail;
+	}
+
+
 	return 0;
 
+ mip6_rawv6_mh_fail:
+	xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
  mip6_rthdr_xfrm_fail:
 	xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
  mip6_destopt_xfrm_fail:
 	return -EAGAIN;
 }
 
-void __exit mip6_fini(void)
+static void __exit mip6_fini(void)
 {
+	if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
+		printk(KERN_INFO "%s: can't remove rawv6 mh filter\n", __FUNCTION__);
 	if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
 		printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __FUNCTION__);
 	if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
 		printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __FUNCTION__);
 }
+
+module_init(mip6_init);
+module_exit(mip6_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a22c9c93d931..aac6aeb8de8c 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -49,7 +49,7 @@
 #include <net/udp.h>
 #include <net/inet_common.h>
 #include <net/tcp_states.h>
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 #include <net/mip6.h>
 #endif
 
@@ -137,6 +137,28 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+static int (*mh_filter)(struct sock *sock, struct sk_buff *skb);
+
+int rawv6_mh_filter_register(int (*filter)(struct sock *sock,
+					   struct sk_buff *skb))
+{
+	rcu_assign_pointer(mh_filter, filter);
+	return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_register);
+
+int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock,
+					     struct sk_buff *skb))
+{
+	rcu_assign_pointer(mh_filter, NULL);
+	synchronize_rcu();
+	return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_unregister);
+
+#endif
+
 /*
  *	demultiplex raw sockets.
  *	(should consider queueing the skb in the sock receive_queue
@@ -178,16 +200,22 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 		case IPPROTO_ICMPV6:
 			filtered = icmpv6_filter(sk, skb);
 			break;
-#ifdef CONFIG_IPV6_MIP6
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 		case IPPROTO_MH:
+		{
 			/* XXX: To validate MH only once for each packet,
 			 * this is placed here. It should be after checking
 			 * xfrm policy, however it doesn't. The checking xfrm
 			 * policy is placed in rawv6_rcv() because it is
 			 * required for each socket.
 			 */
-			filtered = mip6_mh_filter(sk, skb);
+			int (*filter)(struct sock *sock, struct sk_buff *skb);
+
+			filter = rcu_dereference(mh_filter);
+			filtered = filter ? filter(sk, skb) : 0;
 			break;
+		}
 #endif
 		default:
 			filtered = 0;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 1faa2ea80afc..3ec0c4770ee3 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -18,7 +18,7 @@
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 #include <net/mip6.h>
 #endif
 
@@ -318,7 +318,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl)
 			fl->proto = nexthdr;
 			return;
 
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 		case IPPROTO_MH:
 			if (pskb_may_pull(skb, nh + offset + 3 - skb->data)) {
 				struct ip6_mh *mh;
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index baa461b9f74e..cdadb4847469 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -65,7 +65,7 @@ __xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
 		goto end;
 
 	/* Rule 2: select MIPv6 RO or inbound trigger */
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	for (i = 0; i < n; i++) {
 		if (src[i] &&
 		    (src[i]->props.mode == XFRM_MODE_ROUTEOPTIMIZATION ||
@@ -130,7 +130,7 @@ __xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
 		goto end;
 
 	/* Rule 2: select MIPv6 RO or inbound trigger */
-#ifdef CONFIG_IPV6_MIP6
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	for (i = 0; i < n; i++) {
 		if (src[i] &&
 		    (src[i]->mode == XFRM_MODE_ROUTEOPTIMIZATION ||
-- 
cgit v1.3-8-gc7d7


From d212f87b068c9d72065ef579d85b5ee6b8b59381 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 27 Jun 2007 00:47:37 -0700
Subject: [NET]: IPV6 checksum offloading in network devices

The existing model for checksum offload does not correctly handle
devices that can offload IPV4 and IPV6 only. The NETIF_F_HW_CSUM flag
implies device can do any arbitrary protocol.

This patch:
 * adds NETIF_F_IPV6_CSUM for those devices
 * fixes bnx2 and tg3 devices that need it
 * add NETIF_F_IPV6_CSUM to ipv6 output (incl GSO)
 * fixes assumptions about NETIF_F_ALL_CSUM in nat
 * adjusts bridge union of checksumming computation

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bnx2.c                 |  6 +++---
 drivers/net/tg3.c                  |  7 +++----
 include/linux/netdevice.h          |  8 ++++++--
 net/bridge/br_if.c                 | 10 +++++++++-
 net/core/dev.c                     | 24 +++++++++++++++++++++---
 net/ipv4/af_inet.c                 |  3 +++
 net/ipv4/ip_output.c               |  2 +-
 net/ipv4/netfilter/nf_nat_helper.c |  4 ++--
 net/ipv6/ipv6_sockglue.c           |  2 +-
 9 files changed, 49 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index ce3ed67a878e..0f4f76fda26f 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -6490,10 +6490,10 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	memcpy(dev->perm_addr, bp->mac_addr, 6);
 	bp->name = board_info[ent->driver_data].name;
 
+	dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
 	if (CHIP_NUM(bp) == CHIP_NUM_5709)
-		dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
-	else
-		dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
+		dev->features |= NETIF_F_IPV6_CSUM;
+
 #ifdef BCM_VLAN
 	dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
 #endif
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 2f3184184ad9..3a43426ced32 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -11944,12 +11944,11 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 	 * checksumming.
 	 */
 	if ((tp->tg3_flags & TG3_FLAG_BROKEN_CHECKSUMS) == 0) {
+		dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
 		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
 		    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
-			dev->features |= NETIF_F_HW_CSUM;
-		else
-			dev->features |= NETIF_F_IP_CSUM;
-		dev->features |= NETIF_F_SG;
+			dev->features |= NETIF_F_IPV6_CSUM;
+
 		tp->tg3_flags |= TG3_FLAG_RX_CHECKSUMS;
 	} else
 		tp->tg3_flags &= ~TG3_FLAG_RX_CHECKSUMS;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e7913ee5581c..7a8f22fb4eee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -314,9 +314,10 @@ struct net_device
 	/* Net device features */
 	unsigned long		features;
 #define NETIF_F_SG		1	/* Scatter/gather IO. */
-#define NETIF_F_IP_CSUM		2	/* Can checksum only TCP/UDP over IPv4. */
+#define NETIF_F_IP_CSUM		2	/* Can checksum TCP/UDP over IPv4. */
 #define NETIF_F_NO_CSUM		4	/* Does not require checksum. F.e. loopack. */
 #define NETIF_F_HW_CSUM		8	/* Can checksum all the packets. */
+#define NETIF_F_IPV6_CSUM	16	/* Can checksum TCP/UDP over IPV6 */
 #define NETIF_F_HIGHDMA		32	/* Can DMA to high memory. */
 #define NETIF_F_FRAGLIST	64	/* Scatter/gather IO. */
 #define NETIF_F_HW_VLAN_TX	128	/* Transmit VLAN hw acceleration */
@@ -338,8 +339,11 @@ struct net_device
 	/* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
 
+
 #define NETIF_F_GEN_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
-#define NETIF_F_ALL_CSUM	(NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
+#define NETIF_F_V4_CSUM		(NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
+#define NETIF_F_V6_CSUM		(NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
+#define NETIF_F_ALL_CSUM	(NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
 
 	struct net_device	*next_sched;
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 849deaf14108..7b4ce9113be2 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -368,10 +368,18 @@ void br_features_recompute(struct net_bridge *br)
 	list_for_each_entry(p, &br->port_list, list) {
 		unsigned long feature = p->dev->features;
 
+		/* if device needs checksumming, downgrade to hw checksumming */
 		if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
 			checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
+
+		/* if device can't do all checksum, downgrade to ipv4/ipv6 */
 		if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
-			checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
+			checksum ^= NETIF_F_HW_CSUM
+				| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+		if (checksum & NETIF_F_IPV6_CSUM && !(feature & NETIF_F_IPV6_CSUM))
+			checksum &= ~NETIF_F_IPV6_CSUM;
+
 		if (!(feature & NETIF_F_IP_CSUM))
 			checksum = 0;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ee051bb398a0..a0a46e7ed137 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1509,9 +1509,11 @@ int dev_queue_xmit(struct sk_buff *skb)
 		skb_set_transport_header(skb, skb->csum_start -
 					      skb_headroom(skb));
 
-		if (!(dev->features & NETIF_F_GEN_CSUM) &&
-		    (!(dev->features & NETIF_F_IP_CSUM) ||
-		     skb->protocol != htons(ETH_P_IP)))
+		if (!(dev->features & NETIF_F_GEN_CSUM)
+		    || ((dev->features & NETIF_F_IP_CSUM)
+			&& skb->protocol == htons(ETH_P_IP))
+		    || ((dev->features & NETIF_F_IPV6_CSUM)
+			&& skb->protocol == htons(ETH_P_IPV6)))
 			if (skb_checksum_help(skb))
 				goto out_kfree_skb;
 	}
@@ -3107,6 +3109,22 @@ int register_netdevice(struct net_device *dev)
 		}
 	}
 
+	/* Fix illegal checksum combinations */
+	if ((dev->features & NETIF_F_HW_CSUM) &&
+	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
+		       dev->name);
+		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+	}
+
+	if ((dev->features & NETIF_F_NO_CSUM) &&
+	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
+		       dev->name);
+		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+	}
+
+
 	/* Fix illegal SG+CSUM combinations. */
 	if ((dev->features & NETIF_F_SG) &&
 	    !(dev->features & NETIF_F_ALL_CSUM)) {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 041fba3fa0aa..06c08e5740fb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1170,6 +1170,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
 	int ihl;
 	int id;
 
+	if (!(features & NETIF_F_V4_CSUM))
+		features &= ~NETIF_F_SG;
+
 	if (unlikely(skb_shinfo(skb)->gso_type &
 		     ~(SKB_GSO_TCPV4 |
 		       SKB_GSO_UDP |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 34ea4547ebbe..a7dd343d3a03 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -837,7 +837,7 @@ int ip_append_data(struct sock *sk,
 	 */
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
-	    rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
+	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
 	    !exthdrlen)
 		csummode = CHECKSUM_PARTIAL;
 
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 15b6e5ce3a04..b1aa5983a95b 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -178,7 +178,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff **pskb,
 	datalen = (*pskb)->len - iph->ihl*4;
 	if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
 		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    (*pskb)->dev->features & NETIF_F_ALL_CSUM) {
+		    (*pskb)->dev->features & NETIF_F_V4_CSUM) {
 			(*pskb)->ip_summed = CHECKSUM_PARTIAL;
 			(*pskb)->csum_start = skb_headroom(*pskb) +
 					      skb_network_offset(*pskb) +
@@ -265,7 +265,7 @@ nf_nat_mangle_udp_packet(struct sk_buff **pskb,
 
 	if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
 		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    (*pskb)->dev->features & NETIF_F_ALL_CSUM) {
+		    (*pskb)->dev->features & NETIF_F_V4_CSUM) {
 			(*pskb)->ip_summed = CHECKSUM_PARTIAL;
 			(*pskb)->csum_start = skb_headroom(*pskb) +
 					      skb_network_offset(*pskb) +
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b636c38411fb..1c3506696cb5 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -123,7 +123,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features)
 	struct ipv6hdr *ipv6h;
 	struct inet6_protocol *ops;
 
-	if (!(features & NETIF_F_HW_CSUM))
+	if (!(features & NETIF_F_V6_CSUM))
 		features &= ~NETIF_F_SG;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
-- 
cgit v1.3-8-gc7d7


From bf742482d7a647c5c6f03f78eb35a862e159ecf5 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 27 Jun 2007 01:26:19 -0700
Subject: [NET]: dev: introduce generic net_device address lists

Introduce struct dev_addr_list and list maintenance functions
based on dev_mc_list and the related functions. This will be
used by follow-up patches for both multicast and secondary
unicast addresses.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 ++++++++
 net/core/dev.c            | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7a8f22fb4eee..aa389c77aa3e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -177,6 +177,14 @@ struct netif_rx_stats
 
 DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
 
+struct dev_addr_list
+{
+	struct dev_addr_list	*next;
+	u8			da_addr[MAX_ADDR_LEN];
+	u8			da_addrlen;
+	int			da_users;
+	int			da_gusers;
+};
 
 /*
  *	We tag multicasts with these structures.
@@ -1008,6 +1016,9 @@ extern void		dev_mc_upload(struct net_device *dev);
 extern int 		dev_mc_delete(struct net_device *dev, void *addr, int alen, int all);
 extern int		dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly);
 extern void		dev_mc_discard(struct net_device *dev);
+extern int 		__dev_addr_delete(struct dev_addr_list **list, void *addr, int alen, int all);
+extern int		__dev_addr_add(struct dev_addr_list **list, void *addr, int alen, int newonly);
+extern void		__dev_addr_discard(struct dev_addr_list **list);
 extern void		dev_set_promiscuity(struct net_device *dev, int inc);
 extern void		dev_set_allmulti(struct net_device *dev, int inc);
 extern void		netdev_state_change(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index a0a46e7ed137..18759ccdf219 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2553,6 +2553,75 @@ void dev_set_allmulti(struct net_device *dev, int inc)
 		dev_mc_upload(dev);
 }
 
+int __dev_addr_delete(struct dev_addr_list **list, void *addr, int alen,
+		      int glbl)
+{
+	struct dev_addr_list *da;
+
+	for (; (da = *list) != NULL; list = &da->next) {
+		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
+		    alen == da->da_addrlen) {
+			if (glbl) {
+				int old_glbl = da->da_gusers;
+				da->da_gusers = 0;
+				if (old_glbl == 0)
+					break;
+			}
+			if (--da->da_users)
+				return 0;
+
+			*list = da->next;
+			kfree(da);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+int __dev_addr_add(struct dev_addr_list **list, void *addr, int alen, int glbl)
+{
+	struct dev_addr_list *da;
+
+	for (da = *list; da != NULL; da = da->next) {
+		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
+		    da->da_addrlen == alen) {
+			if (glbl) {
+				int old_glbl = da->da_gusers;
+				da->da_gusers = 1;
+				if (old_glbl)
+					return 0;
+			}
+			da->da_users++;
+			return 0;
+		}
+	}
+
+	da = kmalloc(sizeof(*da), GFP_ATOMIC);
+	if (da == NULL)
+		return -ENOMEM;
+	memcpy(da->da_addr, addr, alen);
+	da->da_addrlen = alen;
+	da->da_users = 1;
+	da->da_gusers = glbl ? 1 : 0;
+	da->next = *list;
+	*list = da;
+	return 0;
+}
+
+void __dev_addr_discard(struct dev_addr_list **list)
+{
+	struct dev_addr_list *tmp;
+
+	while (*list != NULL) {
+		tmp = *list;
+		*list = tmp->next;
+		if (tmp->da_users > tmp->da_gusers)
+			printk("__dev_addr_discard: address leakage! "
+			       "da_users=%d\n", tmp->da_users);
+		kfree(tmp);
+	}
+}
+
 unsigned dev_get_flags(const struct net_device *dev)
 {
 	unsigned flags;
-- 
cgit v1.3-8-gc7d7


From 3fba5a8b1e3df2384b90493538161e83cf15dd5f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 27 Jun 2007 01:26:58 -0700
Subject: [NET]: dev_mcast: switch to generic net_device address lists

Use generic net_device address lists for multicast list handling.
Some defines are used to keep drivers working.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 ++++-----
 net/core/dev_mcast.c      | 96 ++++++++---------------------------------------
 2 files changed, 22 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index aa389c77aa3e..9e114e77e54d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -189,15 +189,12 @@ struct dev_addr_list
 /*
  *	We tag multicasts with these structures.
  */
- 
-struct dev_mc_list
-{	
-	struct dev_mc_list	*next;
-	__u8			dmi_addr[MAX_ADDR_LEN];
-	unsigned char		dmi_addrlen;
-	int			dmi_users;
-	int			dmi_gusers;
-};
+
+#define dev_mc_list	dev_addr_list
+#define dmi_addr	da_addr
+#define dmi_addrlen	da_addrlen
+#define dmi_users	da_users
+#define dmi_gusers	da_gusers
 
 struct hh_cache
 {
@@ -400,7 +397,7 @@ struct net_device
 	unsigned char		addr_len;	/* hardware address length	*/
 	unsigned short          dev_id;		/* for shared network cards */
 
-	struct dev_mc_list	*mc_list;	/* Multicast mac addresses	*/
+	struct dev_addr_list	*mc_list;	/* Multicast mac addresses	*/
 	int			mc_count;	/* Number of installed mcasts	*/
 	int			promiscuity;
 	int			allmulti;
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 80bb2e374b76..702907434a47 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -102,47 +102,20 @@ void dev_mc_upload(struct net_device *dev)
 
 int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 {
-	int err = 0;
-	struct dev_mc_list *dmi, **dmip;
+	int err;
 
 	netif_tx_lock_bh(dev);
+	err = __dev_addr_delete(&dev->mc_list, addr, alen, glbl);
+	if (!err) {
+		dev->mc_count--;
 
-	for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
 		/*
-		 *	Find the entry we want to delete. The device could
-		 *	have variable length entries so check these too.
+		 *	We have altered the list, so the card
+		 *	loaded filter is now wrong. Fix it
 		 */
-		if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
-		    alen == dmi->dmi_addrlen) {
-			if (glbl) {
-				int old_glbl = dmi->dmi_gusers;
-				dmi->dmi_gusers = 0;
-				if (old_glbl == 0)
-					break;
-			}
-			if (--dmi->dmi_users)
-				goto done;
-
-			/*
-			 *	Last user. So delete the entry.
-			 */
-			*dmip = dmi->next;
-			dev->mc_count--;
-
-			kfree(dmi);
-
-			/*
-			 *	We have altered the list, so the card
-			 *	loaded filter is now wrong. Fix it
-			 */
-			__dev_mc_upload(dev);
-
-			netif_tx_unlock_bh(dev);
-			return 0;
-		}
+
+		__dev_mc_upload(dev);
 	}
-	err = -ENOENT;
-done:
 	netif_tx_unlock_bh(dev);
 	return err;
 }
@@ -153,46 +126,15 @@ done:
 
 int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 {
-	int err = 0;
-	struct dev_mc_list *dmi, *dmi1;
-
-	dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
+	int err;
 
 	netif_tx_lock_bh(dev);
-	for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
-		if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
-		    dmi->dmi_addrlen == alen) {
-			if (glbl) {
-				int old_glbl = dmi->dmi_gusers;
-				dmi->dmi_gusers = 1;
-				if (old_glbl)
-					goto done;
-			}
-			dmi->dmi_users++;
-			goto done;
-		}
-	}
-
-	if ((dmi = dmi1) == NULL) {
-		netif_tx_unlock_bh(dev);
-		return -ENOMEM;
+	err = __dev_addr_add(&dev->mc_list, addr, alen, glbl);
+	if (!err) {
+		dev->mc_count++;
+		__dev_mc_upload(dev);
 	}
-	memcpy(dmi->dmi_addr, addr, alen);
-	dmi->dmi_addrlen = alen;
-	dmi->next = dev->mc_list;
-	dmi->dmi_users = 1;
-	dmi->dmi_gusers = glbl ? 1 : 0;
-	dev->mc_list = dmi;
-	dev->mc_count++;
-
-	__dev_mc_upload(dev);
-
 	netif_tx_unlock_bh(dev);
-	return 0;
-
-done:
-	netif_tx_unlock_bh(dev);
-	kfree(dmi1);
 	return err;
 }
 
@@ -203,16 +145,8 @@ done:
 void dev_mc_discard(struct net_device *dev)
 {
 	netif_tx_lock_bh(dev);
-
-	while (dev->mc_list != NULL) {
-		struct dev_mc_list *tmp = dev->mc_list;
-		dev->mc_list = tmp->next;
-		if (tmp->dmi_users > tmp->dmi_gusers)
-			printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users);
-		kfree(tmp);
-	}
+	__dev_addr_discard(&dev->mc_list);
 	dev->mc_count = 0;
-
 	netif_tx_unlock_bh(dev);
 }
 
@@ -244,7 +178,7 @@ static void dev_mc_seq_stop(struct seq_file *seq, void *v)
 
 static int dev_mc_seq_show(struct seq_file *seq, void *v)
 {
-	struct dev_mc_list *m;
+	struct dev_addr_list *m;
 	struct net_device *dev = v;
 
 	netif_tx_lock_bh(dev);
-- 
cgit v1.3-8-gc7d7


From 4417da668c0021903464f92db278ddae348e0299 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 27 Jun 2007 01:28:10 -0700
Subject: [NET]: dev: secondary unicast address support

Add support for configuring secondary unicast addresses on network
devices. To support this devices capable of filtering multiple
unicast addresses need to change their set_multicast_list function
to configure unicast filters as well and assign it to dev->set_rx_mode
instead of dev->set_multicast_list. Other devices are put into promiscous
mode when secondary unicast addresses are present.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  12 +++-
 net/core/dev.c            | 144 ++++++++++++++++++++++++++++++++++++++++------
 net/core/dev_mcast.c      |  37 +-----------
 3 files changed, 139 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9e114e77e54d..2c0cc19edfb2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -397,6 +397,9 @@ struct net_device
 	unsigned char		addr_len;	/* hardware address length	*/
 	unsigned short          dev_id;		/* for shared network cards */
 
+	struct dev_addr_list	*uc_list;	/* Secondary unicast mac addresses */
+	int			uc_count;	/* Number of installed ucasts	*/
+	int			uc_promisc;
 	struct dev_addr_list	*mc_list;	/* Multicast mac addresses	*/
 	int			mc_count;	/* Number of installed mcasts	*/
 	int			promiscuity;
@@ -502,6 +505,8 @@ struct net_device
 						void *saddr,
 						unsigned len);
 	int			(*rebuild_header)(struct sk_buff *skb);
+#define HAVE_SET_RX_MODE
+	void			(*set_rx_mode)(struct net_device *dev);
 #define HAVE_MULTICAST			 
 	void			(*set_multicast_list)(struct net_device *dev);
 #define HAVE_SET_MAC_ADDR  		 
@@ -1008,8 +1013,11 @@ extern struct net_device *alloc_netdev(int sizeof_priv, const char *name,
 				       void (*setup)(struct net_device *));
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
-/* Functions used for multicast support */
-extern void		dev_mc_upload(struct net_device *dev);
+/* Functions used for secondary unicast and multicast support */
+extern void		dev_set_rx_mode(struct net_device *dev);
+extern void		__dev_set_rx_mode(struct net_device *dev);
+extern int		dev_unicast_delete(struct net_device *dev, void *addr, int alen);
+extern int		dev_unicast_add(struct net_device *dev, void *addr, int alen);
 extern int 		dev_mc_delete(struct net_device *dev, void *addr, int alen, int all);
 extern int		dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly);
 extern void		dev_mc_discard(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 18759ccdf219..36e9bf8ec4af 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -942,7 +942,7 @@ int dev_open(struct net_device *dev)
 		/*
 		 *	Initialize multicasting status
 		 */
-		dev_mc_upload(dev);
+		dev_set_rx_mode(dev);
 
 		/*
 		 *	Wakeup transmit queue engine
@@ -2498,17 +2498,7 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
 	return 0;
 }
 
-/**
- *	dev_set_promiscuity	- update promiscuity count on a device
- *	@dev: device
- *	@inc: modifier
- *
- *	Add or remove promiscuity from a device. While the count in the device
- *	remains above zero the interface remains promiscuous. Once it hits zero
- *	the device reverts back to normal filtering operation. A negative inc
- *	value is used to drop promiscuity on the device.
- */
-void dev_set_promiscuity(struct net_device *dev, int inc)
+static void __dev_set_promiscuity(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
 
@@ -2517,7 +2507,6 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
 	else
 		dev->flags |= IFF_PROMISC;
 	if (dev->flags != old_flags) {
-		dev_mc_upload(dev);
 		printk(KERN_INFO "device %s %s promiscuous mode\n",
 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
 							       "left");
@@ -2530,6 +2519,25 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
 	}
 }
 
+/**
+ *	dev_set_promiscuity	- update promiscuity count on a device
+ *	@dev: device
+ *	@inc: modifier
+ *
+ *	Add or remove promiscuity from a device. While the count in the device
+ *	remains above zero the interface remains promiscuous. Once it hits zero
+ *	the device reverts back to normal filtering operation. A negative inc
+ *	value is used to drop promiscuity on the device.
+ */
+void dev_set_promiscuity(struct net_device *dev, int inc)
+{
+	unsigned short old_flags = dev->flags;
+
+	__dev_set_promiscuity(dev, inc);
+	if (dev->flags != old_flags)
+		dev_set_rx_mode(dev);
+}
+
 /**
  *	dev_set_allmulti	- update allmulti count on a device
  *	@dev: device
@@ -2550,7 +2558,48 @@ void dev_set_allmulti(struct net_device *dev, int inc)
 	if ((dev->allmulti += inc) == 0)
 		dev->flags &= ~IFF_ALLMULTI;
 	if (dev->flags ^ old_flags)
-		dev_mc_upload(dev);
+		dev_set_rx_mode(dev);
+}
+
+/*
+ *	Upload unicast and multicast address lists to device and
+ *	configure RX filtering. When the device doesn't support unicast
+ *	filtering it is put in promiscous mode while unicast addresses
+ *	are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+	/* dev_open will call this function so the list will stay sane. */
+	if (!(dev->flags&IFF_UP))
+		return;
+
+	if (!netif_device_present(dev))
+	        return;
+
+	if (dev->set_rx_mode)
+		dev->set_rx_mode(dev);
+	else {
+		/* Unicast addresses changes may only happen under the rtnl,
+		 * therefore calling __dev_set_promiscuity here is safe.
+		 */
+		if (dev->uc_count > 0 && !dev->uc_promisc) {
+			__dev_set_promiscuity(dev, 1);
+			dev->uc_promisc = 1;
+		} else if (dev->uc_count == 0 && dev->uc_promisc) {
+			__dev_set_promiscuity(dev, -1);
+			dev->uc_promisc = 0;
+		}
+
+		if (dev->set_multicast_list)
+			dev->set_multicast_list(dev);
+	}
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+	netif_tx_lock_bh(dev);
+	__dev_set_rx_mode(dev);
+	netif_tx_unlock_bh(dev);
 }
 
 int __dev_addr_delete(struct dev_addr_list **list, void *addr, int alen,
@@ -2622,6 +2671,66 @@ void __dev_addr_discard(struct dev_addr_list **list)
 	}
 }
 
+/**
+ *	dev_unicast_delete	- Release secondary unicast address.
+ *	@dev: device
+ *
+ *	Release reference to a secondary unicast address and remove it
+ *	from the device if the reference count drop to zero.
+ *
+ * 	The caller must hold the rtnl_mutex.
+ */
+int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	netif_tx_lock_bh(dev);
+	err = __dev_addr_delete(&dev->uc_list, addr, alen, 0);
+	if (!err) {
+		dev->uc_count--;
+		__dev_set_rx_mode(dev);
+	}
+	netif_tx_unlock_bh(dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_unicast_delete);
+
+/**
+ *	dev_unicast_add		- add a secondary unicast address
+ *	@dev: device
+ *
+ *	Add a secondary unicast address to the device or increase
+ *	the reference count if it already exists.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_unicast_add(struct net_device *dev, void *addr, int alen)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	netif_tx_lock_bh(dev);
+	err = __dev_addr_add(&dev->uc_list, addr, alen, 0);
+	if (!err) {
+		dev->uc_count++;
+		__dev_set_rx_mode(dev);
+	}
+	netif_tx_unlock_bh(dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_unicast_add);
+
+static void dev_unicast_discard(struct net_device *dev)
+{
+	netif_tx_lock_bh(dev);
+	__dev_addr_discard(&dev->uc_list);
+	dev->uc_count = 0;
+	netif_tx_unlock_bh(dev);
+}
+
 unsigned dev_get_flags(const struct net_device *dev)
 {
 	unsigned flags;
@@ -2665,7 +2774,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 	 *	Load in the correct multicast list now the flags have changed.
 	 */
 
-	dev_mc_upload(dev);
+	dev_set_rx_mode(dev);
 
 	/*
 	 *	Have we downed the interface. We handle IFF_UP ourselves
@@ -2678,7 +2787,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
 
 		if (!ret)
-			dev_mc_upload(dev);
+			dev_set_rx_mode(dev);
 	}
 
 	if (dev->flags & IFF_UP &&
@@ -3558,8 +3667,9 @@ void unregister_netdevice(struct net_device *dev)
 	raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
 
 	/*
-	 *	Flush the multicast chain
+	 *	Flush the unicast and multicast chains
 	 */
+	dev_unicast_discard(dev);
 	dev_mc_discard(dev);
 
 	if (dev->uninit)
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 702907434a47..5cc9b448c443 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -63,39 +63,6 @@
  *	We block accesses to device mc filters with netif_tx_lock.
  */
 
-/*
- *	Update the multicast list into the physical NIC controller.
- */
-
-static void __dev_mc_upload(struct net_device *dev)
-{
-	/* Don't do anything till we up the interface
-	 * [dev_open will call this function so the list will
-	 * stay sane]
-	 */
-
-	if (!(dev->flags&IFF_UP))
-		return;
-
-	/*
-	 *	Devices with no set multicast or which have been
-	 *	detached don't get set.
-	 */
-
-	if (dev->set_multicast_list == NULL ||
-	    !netif_device_present(dev))
-		return;
-
-	dev->set_multicast_list(dev);
-}
-
-void dev_mc_upload(struct net_device *dev)
-{
-	netif_tx_lock_bh(dev);
-	__dev_mc_upload(dev);
-	netif_tx_unlock_bh(dev);
-}
-
 /*
  *	Delete a device level multicast
  */
@@ -114,7 +81,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 		 *	loaded filter is now wrong. Fix it
 		 */
 
-		__dev_mc_upload(dev);
+		__dev_set_rx_mode(dev);
 	}
 	netif_tx_unlock_bh(dev);
 	return err;
@@ -132,7 +99,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	err = __dev_addr_add(&dev->mc_list, addr, alen, glbl);
 	if (!err) {
 		dev->mc_count++;
-		__dev_mc_upload(dev);
+		__dev_set_rx_mode(dev);
 	}
 	netif_tx_unlock_bh(dev);
 	return err;
-- 
cgit v1.3-8-gc7d7


From 342f0234c71b40da785dd6a7ce1dd481ecbfdb81 Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 27 Jun 2007 15:37:46 -0700
Subject: [UDP]: Introduce UDP encapsulation type for L2TP

This patch adds a new UDP_ENCAP_L2TPINUDP encapsulation type for UDP
sockets. When a UDP socket's encap_type is UDP_ENCAP_L2TPINUDP, the
skb is delivered to a function pointed to by the udp_sock's
encap_rcv funcptr. If the skb isn't wanted by L2TP, it returns >0, which
causes it to be passed through to UDP.

Include padding to put the new encap_rcv field on a 4-byte boundary.

Previously, the only user of UDP encap sockets was ESP, so when
CONFIG_XFRM was not defined, some of the encap code was compiled
out. This patch changes that. As a result, udp_encap_rcv() will
now do a little more work when CONFIG_XFRM is not defined.

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h |  6 ++++++
 net/ipv4/udp.c      | 27 +++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 6de445c31a64..8ec703f462da 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -42,6 +42,7 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
 #define UDP_ENCAP_ESPINUDP	2 /* draft-ietf-ipsec-udp-encaps-06 */
+#define UDP_ENCAP_L2TPINUDP	3 /* rfc2661 */
 
 #ifdef __KERNEL__
 #include <linux/types.h>
@@ -70,6 +71,11 @@ struct udp_sock {
 #define UDPLITE_SEND_CC  0x2  		/* set via udplite setsockopt         */
 #define UDPLITE_RECV_CC  0x4		/* set via udplite setsocktopt        */
 	__u8		 pcflag;        /* marks socket as UDP-Lite if > 0    */
+	__u8		 unused[3];
+	/*
+	 * For encapsulation sockets.
+	 */
+	int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index facb7e29304e..b9276f8bdac5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -70,6 +70,7 @@
  *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
  *					a single port at the same time.
  *	Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ *	James Chapman		:	Add L2TP encapsulation type.
  *
  *
  *		This program is free software; you can redistribute it and/or
@@ -923,12 +924,10 @@ int udp_disconnect(struct sock *sk, int flags)
  * 	1  if the UDP system should process it
  *	0  if we should drop this packet
  * 	-1 if it should get processed by xfrm4_rcv_encap
+ *	-2 if it should get processed by l2tp
  */
 static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
 {
-#ifndef CONFIG_XFRM
-	return 1;
-#else
 	struct udp_sock *up = udp_sk(sk);
 	struct udphdr *uh;
 	struct iphdr *iph;
@@ -983,8 +982,14 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
 			/* Must be an IKE packet.. pass it through */
 			return 1;
 		break;
+	case UDP_ENCAP_L2TPINUDP:
+		/* Let caller know to send this to l2tp */
+		return -2;
 	}
 
+#ifndef CONFIG_XFRM
+	return 1;
+#else
 	/* At this point we are sure that this is an ESPinUDP packet,
 	 * so we need to remove 'len' bytes from the packet (the UDP
 	 * header and optional ESP marker bytes) and then modify the
@@ -1055,12 +1060,25 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 			kfree_skb(skb);
 			return 0;
 		}
-		if (ret < 0) {
+		if (ret == -1) {
 			/* process the ESP packet */
 			ret = xfrm4_rcv_encap(skb, up->encap_type);
 			UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
 			return -ret;
 		}
+		if (ret == -2) {
+			/* process the L2TP packet */
+			if (up->encap_rcv != NULL) {
+				ret = (*up->encap_rcv)(sk, skb);
+				if (ret <= 0) {
+					UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag);
+					return ret;
+				}
+
+				/* FALLTHROUGH -- pass up as UDP packet */
+			}
+		}
+
 		/* FALLTHROUGH -- it's a UDP Packet */
 	}
 
@@ -1349,6 +1367,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		case 0:
 		case UDP_ENCAP_ESPINUDP:
 		case UDP_ENCAP_ESPINUDP_NON_IKE:
+		case UDP_ENCAP_L2TPINUDP:
 			up->encap_type = val;
 			break;
 		default:
-- 
cgit v1.3-8-gc7d7


From cf14a4d06742d59ecb2d837a3f53bb24d1ff9acb Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Wed, 27 Jun 2007 15:43:43 -0700
Subject: [L2TP]: Changes to existing ppp and socket kernel headers for L2TP

Add struct sockaddr_pppol2tp to carry L2TP-specific address
information for the PPPoX (PPPoL2TP) socket. Unfortunately we can't
use the union inside struct sockaddr_pppox because the L2TP-specific
data is larger than the current size of the union and we must preserve
the size of struct sockaddr_pppox for binary compatibility.

Also add a PPPIOCGL2TPSTATS ioctl to allow userspace to obtain
L2TP counters and state from the kernel.

Add new if_pppol2tp.h header.

[ Modified to use aligned_u64 in statistics structure -DaveM ]

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/Kbuild        |  1 +
 include/linux/if_ppp.h      | 16 +++++++++++
 include/linux/if_pppol2tp.h | 69 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/if_pppox.h    | 16 +++++++++--
 include/linux/socket.h      |  1 +
 5 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/if_pppol2tp.h

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index d94451682761..127d2d192b5a 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -225,6 +225,7 @@ unifdef-y += if_fddi.h
 unifdef-y += if_frad.h
 unifdef-y += if_ltalk.h
 unifdef-y += if_link.h
+unifdef-y += if_pppol2tp.h
 unifdef-y += if_pppox.h
 unifdef-y += if_shaper.h
 unifdef-y += if_tr.h
diff --git a/include/linux/if_ppp.h b/include/linux/if_ppp.h
index 768372f07caa..0f2f70d4e48c 100644
--- a/include/linux/if_ppp.h
+++ b/include/linux/if_ppp.h
@@ -110,6 +110,21 @@ struct ifpppcstatsreq {
 	struct ppp_comp_stats stats;
 };
 
+/* For PPPIOCGL2TPSTATS */
+struct pppol2tp_ioc_stats {
+	__u16		tunnel_id;	/* redundant */
+	__u16		session_id;	/* if zero, get tunnel stats */
+	__u32		using_ipsec:1;	/* valid only for session_id == 0 */
+	aligned_u64	tx_packets;
+	aligned_u64	tx_bytes;
+	aligned_u64	tx_errors;
+	aligned_u64	rx_packets;
+	aligned_u64	rx_bytes;
+	aligned_u64	rx_seq_discards;
+	aligned_u64	rx_oos_packets;
+	aligned_u64	rx_errors;
+};
+
 #define ifr__name       b.ifr_ifrn.ifrn_name
 #define stats_ptr       b.ifr_ifru.ifru_data
 
@@ -146,6 +161,7 @@ struct ifpppcstatsreq {
 #define PPPIOCDISCONN	_IO('t', 57)		/* disconnect channel */
 #define PPPIOCATTCHAN	_IOW('t', 56, int)	/* attach to ppp channel */
 #define PPPIOCGCHAN	_IOR('t', 55, int)	/* get ppp channel number */
+#define PPPIOCGL2TPSTATS _IOR('t', 54, struct pppol2tp_ioc_stats)
 
 #define SIOCGPPPSTATS   (SIOCDEVPRIVATE + 0)
 #define SIOCGPPPVER     (SIOCDEVPRIVATE + 1)	/* NEVER change this!! */
diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h
new file mode 100644
index 000000000000..516203b6fdeb
--- /dev/null
+++ b/include/linux/if_pppol2tp.h
@@ -0,0 +1,69 @@
+/***************************************************************************
+ * Linux PPP over L2TP (PPPoL2TP) Socket Implementation (RFC 2661)
+ *
+ * This file supplies definitions required by the PPP over L2TP driver
+ * (pppol2tp.c).  All version information wrt this file is located in pppol2tp.c
+ *
+ * License:
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef __LINUX_IF_PPPOL2TP_H
+#define __LINUX_IF_PPPOL2TP_H
+
+#include <asm/types.h>
+
+#ifdef __KERNEL__
+#include <linux/in.h>
+#endif
+
+/* Structure used to connect() the socket to a particular tunnel UDP
+ * socket.
+ */
+struct pppol2tp_addr
+{
+	pid_t	pid;			/* pid that owns the fd.
+					 * 0 => current */
+	int	fd;			/* FD of UDP socket to use */
+
+	struct sockaddr_in addr;	/* IP address and port to send to */
+
+	__be16 s_tunnel, s_session;	/* For matching incoming packets */
+	__be16 d_tunnel, d_session;	/* For sending outgoing packets */
+};
+
+/* Socket options:
+ * DEBUG	- bitmask of debug message categories
+ * SENDSEQ	- 0 => don't send packets with sequence numbers
+ *		  1 => send packets with sequence numbers
+ * RECVSEQ	- 0 => receive packet sequence numbers are optional
+ *		  1 => drop receive packets without sequence numbers
+ * LNSMODE	- 0 => act as LAC.
+ *		  1 => act as LNS.
+ * REORDERTO	- reorder timeout (in millisecs). If 0, don't try to reorder.
+ */
+enum {
+	PPPOL2TP_SO_DEBUG	= 1,
+	PPPOL2TP_SO_RECVSEQ	= 2,
+	PPPOL2TP_SO_SENDSEQ	= 3,
+	PPPOL2TP_SO_LNSMODE	= 4,
+	PPPOL2TP_SO_REORDERTO	= 5,
+};
+
+/* Debug message categories for the DEBUG socket option */
+enum {
+	PPPOL2TP_MSG_DEBUG	= (1 << 0),	/* verbose debug (if
+						 * compiled in) */
+	PPPOL2TP_MSG_CONTROL	= (1 << 1),	/* userspace - kernel
+						 * interface */
+	PPPOL2TP_MSG_SEQ	= (1 << 2),	/* sequence numbers */
+	PPPOL2TP_MSG_DATA	= (1 << 3),	/* data packets */
+};
+
+
+
+#endif
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index 6f987be60fe2..25652545ba6e 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -27,6 +27,7 @@
 #include <asm/semaphore.h>
 #include <linux/ppp_channel.h>
 #endif /* __KERNEL__ */
+#include <linux/if_pppol2tp.h>
 
 /* For user-space programs to pick up these definitions
  * which they wouldn't get otherwise without defining __KERNEL__
@@ -50,8 +51,9 @@ struct pppoe_addr{
  * Protocols supported by AF_PPPOX 
  */ 
 #define PX_PROTO_OE    0 /* Currently just PPPoE */
-#define PX_MAX_PROTO   1	
- 
+#define PX_PROTO_OL2TP 1 /* Now L2TP also */
+#define PX_MAX_PROTO   2
+
 struct sockaddr_pppox { 
        sa_family_t     sa_family;            /* address family, AF_PPPOX */ 
        unsigned int    sa_protocol;          /* protocol identifier */ 
@@ -60,6 +62,16 @@ struct sockaddr_pppox {
        }sa_addr; 
 }__attribute__ ((packed)); 
 
+/* The use of the above union isn't viable because the size of this
+ * struct must stay fixed over time -- applications use sizeof(struct
+ * sockaddr_pppox) to fill it. We use a protocol specific sockaddr
+ * type instead.
+ */
+struct sockaddr_pppol2tp {
+	sa_family_t     sa_family;      /* address family, AF_PPPOX */
+	unsigned int    sa_protocol;    /* protocol identifier */
+	struct pppol2tp_addr pppol2tp;
+}__attribute__ ((packed));
 
 /*********************************************************************
  *
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 6e7c9483a6a6..fe195c97a89d 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -287,6 +287,7 @@ struct ucred {
 #define SOL_NETLINK	270
 #define SOL_TIPC	271
 #define SOL_RXRPC	272
+#define SOL_PPPOL2TP	273
 
 /* IPX options */
 #define IPX_TYPE	1
-- 
cgit v1.3-8-gc7d7


From f25f4e44808f0f6c9875d94ef1c41ef86c288eb2 Mon Sep 17 00:00:00 2001
From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Date: Fri, 6 Jul 2007 13:36:20 -0700
Subject: [CORE] Stack changes to add multiqueue hardware support API

Add the multiqueue hardware device support API to the core network
stack.  Allow drivers to allocate multiple queues and manage them at
the netdev level if they choose to do so.

Added a new field to sk_buff, namely queue_mapping, for drivers to
know which tx_ring to select based on OS classification of the flow.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig         |  8 +++++
 include/linux/etherdevice.h |  3 +-
 include/linux/netdevice.h   | 80 ++++++++++++++++++++++++++++++++++++++++++---
 include/linux/skbuff.h      | 25 ++++++++++++--
 net/core/dev.c              | 36 ++++++++++++++------
 net/core/netpoll.c          |  8 +++--
 net/core/pktgen.c           | 10 ++++--
 net/core/skbuff.c           |  3 ++
 net/ethernet/eth.c          |  9 ++---
 net/sched/sch_teql.c        |  6 +++-
 10 files changed, 158 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index c251cca295c1..d4e39ff1545b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -25,6 +25,14 @@ menuconfig NETDEVICES
 # that for each of the symbols.
 if NETDEVICES
 
+config NETDEVICES_MULTIQUEUE
+	bool "Netdevice multiple hardware queue support"
+	---help---
+	  Say Y here if you want to allow the network stack to use multiple
+	  hardware TX queues on an ethernet device.
+
+	  Most people will say N here.
+
 config IFB
 	tristate "Intermediate Functional Block support"
 	depends on NET_CLS_ACT
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index f48eb89efd0f..6cdb97365e47 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -39,7 +39,8 @@ extern void		eth_header_cache_update(struct hh_cache *hh, struct net_device *dev
 extern int		eth_header_cache(struct neighbour *neigh,
 					 struct hh_cache *hh);
 
-extern struct net_device *alloc_etherdev(int sizeof_priv);
+extern struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count);
+#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
 
 /**
  * is_zero_ether_addr - Determine if give Ethernet address is all zeros.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2c0cc19edfb2..9817821729c4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -108,6 +108,14 @@ struct wireless_dev;
 #define MAX_HEADER (LL_MAX_HEADER + 48)
 #endif
 
+struct net_device_subqueue
+{
+	/* Give a control state for each queue.  This struct may contain
+	 * per-queue locks in the future.
+	 */
+	unsigned long   state;
+};
+
 /*
  *	Network device statistics. Akin to the 2.0 ether stats but
  *	with byte counters.
@@ -331,6 +339,7 @@ struct net_device
 #define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
 #define NETIF_F_GSO		2048	/* Enable software GSO. */
 #define NETIF_F_LLTX		4096	/* LockLess TX */
+#define NETIF_F_MULTI_QUEUE	16384	/* Has multiple TX/RX queues */
 
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
@@ -557,6 +566,10 @@ struct net_device
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
+
+	/* The TX queue control structures */
+	unsigned int			egress_subqueue_count;
+	struct net_device_subqueue	egress_subqueue[0];
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -565,9 +578,7 @@ struct net_device
 
 static inline void *netdev_priv(const struct net_device *dev)
 {
-	return (char *)dev + ((sizeof(struct net_device)
-					+ NETDEV_ALIGN_CONST)
-				& ~NETDEV_ALIGN_CONST);
+	return dev->priv;
 }
 
 #define SET_MODULE_OWNER(dev) do { } while (0)
@@ -719,6 +730,62 @@ static inline int netif_running(const struct net_device *dev)
 	return test_bit(__LINK_STATE_START, &dev->state);
 }
 
+/*
+ * Routines to manage the subqueues on a device.  We only need start
+ * stop, and a check if it's stopped.  All other device management is
+ * done at the overall netdevice level.
+ * Also test the device if we're multiqueue.
+ */
+static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETDEVICES_MULTIQUEUE
+	clear_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+#endif
+}
+
+static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETDEVICES_MULTIQUEUE
+#ifdef CONFIG_NETPOLL_TRAP
+	if (netpoll_trap())
+		return;
+#endif
+	set_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+#endif
+}
+
+static inline int netif_subqueue_stopped(const struct net_device *dev,
+					 u16 queue_index)
+{
+#ifdef CONFIG_NETDEVICES_MULTIQUEUE
+	return test_bit(__LINK_STATE_XOFF,
+			&dev->egress_subqueue[queue_index].state);
+#else
+	return 0;
+#endif
+}
+
+static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETDEVICES_MULTIQUEUE
+#ifdef CONFIG_NETPOLL_TRAP
+	if (netpoll_trap())
+		return;
+#endif
+	if (test_and_clear_bit(__LINK_STATE_XOFF,
+			       &dev->egress_subqueue[queue_index].state))
+		__netif_schedule(dev);
+#endif
+}
+
+static inline int netif_is_multiqueue(const struct net_device *dev)
+{
+#ifdef CONFIG_NETDEVICES_MULTIQUEUE
+	return (!!(NETIF_F_MULTI_QUEUE & dev->features));
+#else
+	return 0;
+#endif
+}
 
 /* Use this variant when it is known for sure that it
  * is executing from interrupt context.
@@ -1009,8 +1076,11 @@ static inline void netif_tx_disable(struct net_device *dev)
 extern void		ether_setup(struct net_device *dev);
 
 /* Support for loadable net-drivers */
-extern struct net_device *alloc_netdev(int sizeof_priv, const char *name,
-				       void (*setup)(struct net_device *));
+extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+				       void (*setup)(struct net_device *),
+				       unsigned int queue_count);
+#define alloc_netdev(sizeof_priv, name, setup) \
+	alloc_netdev_mq(sizeof_priv, name, setup, 1)
 extern int		register_netdev(struct net_device *dev);
 extern void		unregister_netdev(struct net_device *dev);
 /* Functions used for secondary unicast and multicast support */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 881fe80f01d0..2d6a14f5f2f1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -196,7 +196,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@sk: Socket we are owned by
  *	@tstamp: Time we arrived
  *	@dev: Device we arrived on/are leaving by
- *	@iif: ifindex of device we arrived on
  *	@transport_header: Transport layer header
  *	@network_header: Network layer header
  *	@mac_header: Link layer header
@@ -231,6 +230,8 @@ typedef unsigned char *sk_buff_data_t;
  *	@nfctinfo: Relationship of this skb to the connection
  *	@nfct_reasm: netfilter conntrack re-assembly pointer
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
+ *	@iif: ifindex of device we arrived on
+ *	@queue_mapping: Queue mapping for multiqueue devices
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
  *	@dma_cookie: a cookie to one of several possible DMA operations
@@ -246,8 +247,6 @@ struct sk_buff {
 	struct sock		*sk;
 	ktime_t			tstamp;
 	struct net_device	*dev;
-	int			iif;
-	/* 4 byte hole on 64 bit*/
 
 	struct  dst_entry	*dst;
 	struct	sec_path	*sp;
@@ -290,12 +289,18 @@ struct sk_buff {
 #ifdef CONFIG_BRIDGE_NETFILTER
 	struct nf_bridge_info	*nf_bridge;
 #endif
+
+	int			iif;
+	__u16			queue_mapping;
+
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
 #ifdef CONFIG_NET_CLS_ACT
 	__u16			tc_verd;	/* traffic control verdict */
 #endif
 #endif
+	/* 2 byte hole */
+
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
 #endif
@@ -1725,6 +1730,20 @@ static inline void skb_init_secmark(struct sk_buff *skb)
 { }
 #endif
 
+static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
+{
+#ifdef CONFIG_NETDEVICES_MULTIQUEUE
+	skb->queue_mapping = queue_mapping;
+#endif
+}
+
+static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
+{
+#ifdef CONFIG_NETDEVICES_MULTIQUEUE
+	to->queue_mapping = from->queue_mapping;
+#endif
+}
+
 static inline int skb_is_gso(const struct sk_buff *skb)
 {
 	return skb_shinfo(skb)->gso_size;
diff --git a/net/core/dev.c b/net/core/dev.c
index 6dce9d2d46f2..7ddf66d0ad5e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1429,7 +1429,9 @@ gso:
 			skb->next = nskb;
 			return rc;
 		}
-		if (unlikely(netif_queue_stopped(dev) && skb->next))
+		if (unlikely((netif_queue_stopped(dev) ||
+			     netif_subqueue_stopped(dev, skb->queue_mapping)) &&
+			     skb->next))
 			return NETDEV_TX_BUSY;
 	} while (skb->next);
 
@@ -1547,6 +1549,8 @@ gso:
 		spin_lock(&dev->queue_lock);
 		q = dev->qdisc;
 		if (q->enqueue) {
+			/* reset queue_mapping to zero */
+			skb->queue_mapping = 0;
 			rc = q->enqueue(skb, q);
 			qdisc_run(dev);
 			spin_unlock(&dev->queue_lock);
@@ -1576,7 +1580,8 @@ gso:
 
 			HARD_TX_LOCK(dev, cpu);
 
-			if (!netif_queue_stopped(dev)) {
+			if (!netif_queue_stopped(dev) &&
+			    !netif_subqueue_stopped(dev, skb->queue_mapping)) {
 				rc = 0;
 				if (!dev_hard_start_xmit(skb, dev)) {
 					HARD_TX_UNLOCK(dev);
@@ -3539,16 +3544,18 @@ static struct net_device_stats *internal_stats(struct net_device *dev)
 }
 
 /**
- *	alloc_netdev - allocate network device
+ *	alloc_netdev_mq - allocate network device
  *	@sizeof_priv:	size of private data to allocate space for
  *	@name:		device name format string
  *	@setup:		callback to initialize device
+ *	@queue_count:	the number of subqueues to allocate
  *
  *	Allocates a struct net_device with private data area for driver use
- *	and performs basic initialization.
+ *	and performs basic initialization.  Also allocates subquue structs
+ *	for each queue on the device at the end of the netdevice.
  */
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
-		void (*setup)(struct net_device *))
+struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+		void (*setup)(struct net_device *), unsigned int queue_count)
 {
 	void *p;
 	struct net_device *dev;
@@ -3557,7 +3564,9 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
 	/* ensure 32-byte alignment of both the device and private area */
-	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
+	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
+		     (sizeof(struct net_device_subqueue) * queue_count)) &
+		     ~NETDEV_ALIGN_CONST;
 	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
 
 	p = kzalloc(alloc_size, GFP_KERNEL);
@@ -3570,15 +3579,22 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
 	dev->padded = (char *)dev - (char *)p;
 
-	if (sizeof_priv)
-		dev->priv = netdev_priv(dev);
+	if (sizeof_priv) {
+		dev->priv = ((char *)dev +
+			     ((sizeof(struct net_device) +
+			       (sizeof(struct net_device_subqueue) *
+				queue_count) + NETDEV_ALIGN_CONST)
+			      & ~NETDEV_ALIGN_CONST));
+	}
+
+	dev->egress_subqueue_count = queue_count;
 
 	dev->get_stats = internal_stats;
 	setup(dev);
 	strcpy(dev->name, name);
 	return dev;
 }
-EXPORT_SYMBOL(alloc_netdev);
+EXPORT_SYMBOL(alloc_netdev_mq);
 
 /**
  *	free_netdev - free network device
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a0efdd7a6b37..4b06d1936375 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -66,8 +66,9 @@ static void queue_process(struct work_struct *work)
 
 		local_irq_save(flags);
 		netif_tx_lock(dev);
-		if (netif_queue_stopped(dev) ||
-		    dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
+		if ((netif_queue_stopped(dev) ||
+		     netif_subqueue_stopped(dev, skb->queue_mapping)) ||
+		     dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
 			skb_queue_head(&npinfo->txq, skb);
 			netif_tx_unlock(dev);
 			local_irq_restore(flags);
@@ -254,7 +255,8 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
 		     tries > 0; --tries) {
 			if (netif_tx_trylock(dev)) {
-				if (!netif_queue_stopped(dev))
+				if (!netif_queue_stopped(dev) &&
+				    !netif_subqueue_stopped(dev, skb->queue_mapping))
 					status = dev->hard_start_xmit(skb, dev);
 				netif_tx_unlock(dev);
 
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 9cd3a1cb60ef..dffe067e7a7b 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3139,7 +3139,9 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		}
 	}
 
-	if (netif_queue_stopped(odev) || need_resched()) {
+	if ((netif_queue_stopped(odev) ||
+	     netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) ||
+	     need_resched()) {
 		idle_start = getCurUs();
 
 		if (!netif_running(odev)) {
@@ -3154,7 +3156,8 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 
 		pkt_dev->idle_acc += getCurUs() - idle_start;
 
-		if (netif_queue_stopped(odev)) {
+		if (netif_queue_stopped(odev) ||
+		    netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) {
 			pkt_dev->next_tx_us = getCurUs();	/* TODO */
 			pkt_dev->next_tx_ns = 0;
 			goto out;	/* Try the next interface */
@@ -3181,7 +3184,8 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 	}
 
 	netif_tx_lock_bh(odev);
-	if (!netif_queue_stopped(odev)) {
+	if (!netif_queue_stopped(odev) &&
+	    !netif_subqueue_stopped(odev, pkt_dev->skb->queue_mapping)) {
 
 		atomic_inc(&(pkt_dev->skb->users));
 	      retry_now:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c989c3a0f907..6a41b96b3d37 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -419,6 +419,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	n->nohdr = 0;
 	C(pkt_type);
 	C(ip_summed);
+	skb_copy_queue_mapping(n, skb);
 	C(priority);
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	C(ipvs_property);
@@ -460,6 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 	new->sk		= NULL;
 	new->dev	= old->dev;
+	skb_copy_queue_mapping(new, old);
 	new->priority	= old->priority;
 	new->protocol	= old->protocol;
 	new->dst	= dst_clone(old->dst);
@@ -1932,6 +1934,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 		tail = nskb;
 
 		nskb->dev = skb->dev;
+		skb_copy_queue_mapping(nskb, skb);
 		nskb->priority = skb->priority;
 		nskb->protocol = skb->protocol;
 		nskb->dst = dst_clone(skb->dst);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 0ac2524f3b68..1387e5411f77 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -316,9 +316,10 @@ void ether_setup(struct net_device *dev)
 EXPORT_SYMBOL(ether_setup);
 
 /**
- * alloc_etherdev - Allocates and sets up an Ethernet device
+ * alloc_etherdev_mq - Allocates and sets up an Ethernet device
  * @sizeof_priv: Size of additional driver-private structure to be allocated
  *	for this Ethernet device
+ * @queue_count: The number of queues this device has.
  *
  * Fill in the fields of the device structure with Ethernet-generic
  * values. Basically does everything except registering the device.
@@ -328,8 +329,8 @@ EXPORT_SYMBOL(ether_setup);
  * this private data area.
  */
 
-struct net_device *alloc_etherdev(int sizeof_priv)
+struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count)
 {
-	return alloc_netdev(sizeof_priv, "eth%d", ether_setup);
+	return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count);
 }
-EXPORT_SYMBOL(alloc_etherdev);
+EXPORT_SYMBOL(alloc_etherdev_mq);
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index f05ad9a30b4c..dfe7e4520988 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -277,6 +277,7 @@ static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
 	int busy;
 	int nores;
 	int len = skb->len;
+	int subq = skb->queue_mapping;
 	struct sk_buff *skb_res = NULL;
 
 	start = master->slaves;
@@ -293,7 +294,9 @@ restart:
 
 		if (slave->qdisc_sleeping != q)
 			continue;
-		if (netif_queue_stopped(slave) || ! netif_running(slave)) {
+		if (netif_queue_stopped(slave) ||
+		    netif_subqueue_stopped(slave, subq) ||
+		    !netif_running(slave)) {
 			busy = 1;
 			continue;
 		}
@@ -302,6 +305,7 @@ restart:
 		case 0:
 			if (netif_tx_trylock(slave)) {
 				if (!netif_queue_stopped(slave) &&
+				    !netif_subqueue_stopped(slave, subq) &&
 				    slave->hard_start_xmit(skb, slave) == 0) {
 					netif_tx_unlock(slave);
 					master->slaves = NEXT_SLAVE(q);
-- 
cgit v1.3-8-gc7d7


From d62733c8e437fdb58325617c4b3331769ba82d70 Mon Sep 17 00:00:00 2001
From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Date: Thu, 28 Jun 2007 21:04:31 -0700
Subject: [SCHED]: Qdisc changes and sch_rr added for multiqueue

Add the new sch_rr qdisc for multiqueue network device support.  Allow
sch_prio and sch_rr to be compiled with or without multiqueue hardware
support.

sch_rr is part of sch_prio, and is referenced from MODULE_ALIAS.  This
was done since sch_prio and sch_rr only differ in their dequeue
routine.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_sched.h |   9 ++++
 net/sched/Kconfig         |  11 ++++
 net/sched/sch_prio.c      | 129 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 134 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index d10f35338507..268c51599eb8 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -101,6 +101,15 @@ struct tc_prio_qopt
 	__u8	priomap[TC_PRIO_MAX+1];	/* Map: logical priority -> PRIO band */
 };
 
+enum
+{
+	TCA_PRIO_UNSPEC,
+	TCA_PRIO_MQ,
+	__TCA_PRIO_MAX
+};
+
+#define TCA_PRIO_MAX    (__TCA_PRIO_MAX - 1)
+
 /* TBF section */
 
 struct tc_tbf_qopt
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df8449be9..f3217942ca87 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -111,6 +111,17 @@ config NET_SCH_PRIO
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_prio.
 
+config NET_SCH_RR
+	tristate "Multi Band Round Robin Queuing (RR)"
+	select NET_SCH_PRIO
+	---help---
+	  Say Y here if you want to use an n-band round robin packet
+	  scheduler.
+
+	  The module uses sch_prio for its framework and is aliased as
+	  sch_rr, so it will load sch_prio, although it is referred
+	  to using sch_rr.
+
 config NET_SCH_RED
 	tristate "Random Early Detection (RED)"
 	---help---
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 6d7542c26e47..404522046289 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -40,9 +40,11 @@
 struct prio_sched_data
 {
 	int bands;
+	int curband; /* for round-robin */
 	struct tcf_proto *filter_list;
 	u8  prio2band[TC_PRIO_MAX+1];
 	struct Qdisc *queues[TCQ_PRIO_BANDS];
+	int mq;
 };
 
 
@@ -70,14 +72,17 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 #endif
 			if (TC_H_MAJ(band))
 				band = 0;
-			return q->queues[q->prio2band[band&TC_PRIO_MAX]];
+			band = q->prio2band[band&TC_PRIO_MAX];
+			goto out;
 		}
 		band = res.classid;
 	}
 	band = TC_H_MIN(band) - 1;
 	if (band >= q->bands)
-		return q->queues[q->prio2band[0]];
-
+		band = q->prio2band[0];
+out:
+	if (q->mq)
+		skb_set_queue_mapping(skb, band);
 	return q->queues[band];
 }
 
@@ -144,17 +149,58 @@ prio_dequeue(struct Qdisc* sch)
 	struct Qdisc *qdisc;
 
 	for (prio = 0; prio < q->bands; prio++) {
-		qdisc = q->queues[prio];
-		skb = qdisc->dequeue(qdisc);
-		if (skb) {
-			sch->q.qlen--;
-			return skb;
+		/* Check if the target subqueue is available before
+		 * pulling an skb.  This way we avoid excessive requeues
+		 * for slower queues.
+		 */
+		if (!netif_subqueue_stopped(sch->dev, (q->mq ? prio : 0))) {
+			qdisc = q->queues[prio];
+			skb = qdisc->dequeue(qdisc);
+			if (skb) {
+				sch->q.qlen--;
+				return skb;
+			}
 		}
 	}
 	return NULL;
 
 }
 
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+	struct sk_buff *skb;
+	struct prio_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	int bandcount;
+
+	/* Only take one pass through the queues.  If nothing is available,
+	 * return nothing.
+	 */
+	for (bandcount = 0; bandcount < q->bands; bandcount++) {
+		/* Check if the target subqueue is available before
+		 * pulling an skb.  This way we avoid excessive requeues
+		 * for slower queues.  If the queue is stopped, try the
+		 * next queue.
+		 */
+		if (!netif_subqueue_stopped(sch->dev,
+					    (q->mq ? q->curband : 0))) {
+			qdisc = q->queues[q->curband];
+			skb = qdisc->dequeue(qdisc);
+			if (skb) {
+				sch->q.qlen--;
+				q->curband++;
+				if (q->curband >= q->bands)
+					q->curband = 0;
+				return skb;
+			}
+		}
+		q->curband++;
+		if (q->curband >= q->bands)
+			q->curband = 0;
+	}
+	return NULL;
+}
+
 static unsigned int prio_drop(struct Qdisc* sch)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
@@ -198,21 +244,41 @@ prio_destroy(struct Qdisc* sch)
 static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
-	struct tc_prio_qopt *qopt = RTA_DATA(opt);
+	struct tc_prio_qopt *qopt;
+	struct rtattr *tb[TCA_PRIO_MAX];
 	int i;
 
-	if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+	if (rtattr_parse_nested_compat(tb, TCA_PRIO_MAX, opt, qopt,
+				       sizeof(*qopt)))
 		return -EINVAL;
-	if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+	q->bands = qopt->bands;
+	/* If we're multiqueue, make sure the number of incoming bands
+	 * matches the number of queues on the device we're associating with.
+	 * If the number of bands requested is zero, then set q->bands to
+	 * dev->egress_subqueue_count.
+	 */
+	q->mq = RTA_GET_FLAG(tb[TCA_PRIO_MQ - 1]);
+	if (q->mq) {
+		if (sch->handle != TC_H_ROOT)
+			return -EINVAL;
+		if (netif_is_multiqueue(sch->dev)) {
+			if (q->bands == 0)
+				q->bands = sch->dev->egress_subqueue_count;
+			else if (q->bands != sch->dev->egress_subqueue_count)
+				return -EINVAL;
+		} else
+			return -EOPNOTSUPP;
+	}
+
+	if (q->bands > TCQ_PRIO_BANDS || q->bands < 2)
 		return -EINVAL;
 
 	for (i=0; i<=TC_PRIO_MAX; i++) {
-		if (qopt->priomap[i] >= qopt->bands)
+		if (qopt->priomap[i] >= q->bands)
 			return -EINVAL;
 	}
 
 	sch_tree_lock(sch);
-	q->bands = qopt->bands;
 	memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
 
 	for (i=q->bands; i<TCQ_PRIO_BANDS; i++) {
@@ -268,11 +334,17 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	unsigned char *b = skb_tail_pointer(skb);
+	struct rtattr *nest;
 	struct tc_prio_qopt opt;
 
 	opt.bands = q->bands;
 	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
-	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (q->mq)
+		RTA_PUT_FLAG(skb, TCA_PRIO_MQ);
+	RTA_NEST_COMPAT_END(skb, nest);
+
 	return skb->len;
 
 rtattr_failure:
@@ -443,17 +515,44 @@ static struct Qdisc_ops prio_qdisc_ops = {
 	.owner		=	THIS_MODULE,
 };
 
+static struct Qdisc_ops rr_qdisc_ops = {
+	.next		=	NULL,
+	.cl_ops		=	&prio_class_ops,
+	.id		=	"rr",
+	.priv_size	=	sizeof(struct prio_sched_data),
+	.enqueue	=	prio_enqueue,
+	.dequeue	=	rr_dequeue,
+	.requeue	=	prio_requeue,
+	.drop		=	prio_drop,
+	.init		=	prio_init,
+	.reset		=	prio_reset,
+	.destroy	=	prio_destroy,
+	.change		=	prio_tune,
+	.dump		=	prio_dump,
+	.owner		=	THIS_MODULE,
+};
+
 static int __init prio_module_init(void)
 {
-	return register_qdisc(&prio_qdisc_ops);
+	int err;
+
+	err = register_qdisc(&prio_qdisc_ops);
+	if (err < 0)
+		return err;
+	err = register_qdisc(&rr_qdisc_ops);
+	if (err < 0)
+		unregister_qdisc(&prio_qdisc_ops);
+	return err;
 }
 
 static void __exit prio_module_exit(void)
 {
 	unregister_qdisc(&prio_qdisc_ops);
+	unregister_qdisc(&rr_qdisc_ops);
 }
 
 module_init(prio_module_init)
 module_exit(prio_module_exit)
 
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("sch_rr");
-- 
cgit v1.3-8-gc7d7


From 61cbc2fca6335be52788773b21efdc52a2750924 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 30 Jun 2007 13:35:52 -0700
Subject: [NET]: Fix secondary unicast/multicast address count maintenance

When a reference to an existing address is increased or decreased without
hitting zero, the address count is incorrectly adjusted.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 ++--
 net/core/dev.c            | 21 ++++++++++-----------
 net/core/dev_mcast.c      | 11 ++++-------
 3 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9817821729c4..8590d685d935 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1091,8 +1091,8 @@ extern int		dev_unicast_add(struct net_device *dev, void *addr, int alen);
 extern int 		dev_mc_delete(struct net_device *dev, void *addr, int alen, int all);
 extern int		dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly);
 extern void		dev_mc_discard(struct net_device *dev);
-extern int 		__dev_addr_delete(struct dev_addr_list **list, void *addr, int alen, int all);
-extern int		__dev_addr_add(struct dev_addr_list **list, void *addr, int alen, int newonly);
+extern int 		__dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int all);
+extern int		__dev_addr_add(struct dev_addr_list **list, int *count, void *addr, int alen, int newonly);
 extern void		__dev_addr_discard(struct dev_addr_list **list);
 extern void		dev_set_promiscuity(struct net_device *dev, int inc);
 extern void		dev_set_allmulti(struct net_device *dev, int inc);
diff --git a/net/core/dev.c b/net/core/dev.c
index 7ddf66d0ad5e..4221dcda88d7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2607,8 +2607,8 @@ void dev_set_rx_mode(struct net_device *dev)
 	netif_tx_unlock_bh(dev);
 }
 
-int __dev_addr_delete(struct dev_addr_list **list, void *addr, int alen,
-		      int glbl)
+int __dev_addr_delete(struct dev_addr_list **list, int *count,
+		      void *addr, int alen, int glbl)
 {
 	struct dev_addr_list *da;
 
@@ -2626,13 +2626,15 @@ int __dev_addr_delete(struct dev_addr_list **list, void *addr, int alen,
 
 			*list = da->next;
 			kfree(da);
+			(*count)--;
 			return 0;
 		}
 	}
 	return -ENOENT;
 }
 
-int __dev_addr_add(struct dev_addr_list **list, void *addr, int alen, int glbl)
+int __dev_addr_add(struct dev_addr_list **list, int *count,
+		   void *addr, int alen, int glbl)
 {
 	struct dev_addr_list *da;
 
@@ -2659,6 +2661,7 @@ int __dev_addr_add(struct dev_addr_list **list, void *addr, int alen, int glbl)
 	da->da_gusers = glbl ? 1 : 0;
 	da->next = *list;
 	*list = da;
+	(*count)++;
 	return 0;
 }
 
@@ -2692,11 +2695,9 @@ int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
 	ASSERT_RTNL();
 
 	netif_tx_lock_bh(dev);
-	err = __dev_addr_delete(&dev->uc_list, addr, alen, 0);
-	if (!err) {
-		dev->uc_count--;
+	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
+	if (!err)
 		__dev_set_rx_mode(dev);
-	}
 	netif_tx_unlock_bh(dev);
 	return err;
 }
@@ -2718,11 +2719,9 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen)
 	ASSERT_RTNL();
 
 	netif_tx_lock_bh(dev);
-	err = __dev_addr_add(&dev->uc_list, addr, alen, 0);
-	if (!err) {
-		dev->uc_count++;
+	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
+	if (!err)
 		__dev_set_rx_mode(dev);
-	}
 	netif_tx_unlock_bh(dev);
 	return err;
 }
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 5cc9b448c443..aa38100601fb 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -72,10 +72,9 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 	int err;
 
 	netif_tx_lock_bh(dev);
-	err = __dev_addr_delete(&dev->mc_list, addr, alen, glbl);
+	err = __dev_addr_delete(&dev->mc_list, &dev->mc_count,
+				addr, alen, glbl);
 	if (!err) {
-		dev->mc_count--;
-
 		/*
 		 *	We have altered the list, so the card
 		 *	loaded filter is now wrong. Fix it
@@ -96,11 +95,9 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	int err;
 
 	netif_tx_lock_bh(dev);
-	err = __dev_addr_add(&dev->mc_list, addr, alen, glbl);
-	if (!err) {
-		dev->mc_count++;
+	err = __dev_addr_add(&dev->mc_list, &dev->mc_count, addr, alen, glbl);
+	if (!err)
 		__dev_set_rx_mode(dev);
-	}
 	netif_tx_unlock_bh(dev);
 	return err;
 }
-- 
cgit v1.3-8-gc7d7


From 8c644623fe7e41f59fe97cdf666cba3cb7ced7d8 Mon Sep 17 00:00:00 2001
From: Guido Guenther <agx@sigxcpu.org>
Date: Mon, 2 Jul 2007 22:50:25 -0700
Subject: [NET]: Allow group ownership of TUN/TAP devices.

Introduce a new syscall TUNSETGROUP for group ownership setting of tap
devices. The user now is allowed to send packages if either his euid or
his egid matches the one specified via tunctl (via -u or -g
respecitvely). If both, gid and uid, are set via tunctl, both have to
match.

Signed-off-by: Guido Guenther <agx@sigxcpu.org>
Signed-off-by: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c      | 15 +++++++++++++--
 include/linux/if_tun.h |  2 ++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a2c6caaaae93..62b2b3005019 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -432,6 +432,7 @@ static void tun_setup(struct net_device *dev)
 	init_waitqueue_head(&tun->read_wait);
 
 	tun->owner = -1;
+	tun->group = -1;
 
 	SET_MODULE_OWNER(dev);
 	dev->open = tun_net_open;
@@ -467,8 +468,11 @@ static int tun_set_iff(struct file *file, struct ifreq *ifr)
 			return -EBUSY;
 
 		/* Check permissions */
-		if (tun->owner != -1 &&
-		    current->euid != tun->owner && !capable(CAP_NET_ADMIN))
+		if (((tun->owner != -1 &&
+		      current->euid != tun->owner) ||
+		     (tun->group != -1 &&
+		      current->egid != tun->group)) &&
+		     !capable(CAP_NET_ADMIN))
 			return -EPERM;
 	}
 	else if (__dev_get_by_name(ifr->ifr_name))
@@ -610,6 +614,13 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
 		DBG(KERN_INFO "%s: owner set to %d\n", tun->dev->name, tun->owner);
 		break;
 
+	case TUNSETGROUP:
+		/* Set group of the device */
+		tun->group= (gid_t) arg;
+
+		DBG(KERN_INFO "%s: group set to %d\n", tun->dev->name, tun->group);
+		break;
+
 	case TUNSETLINK:
 		/* Only allow setting the type when the interface is down */
 		if (tun->dev->flags & IFF_UP) {
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 88aef7b86ef4..42eb6945b93e 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -36,6 +36,7 @@ struct tun_struct {
 	unsigned long 		flags;
 	int			attached;
 	uid_t			owner;
+	gid_t			group;
 
 	wait_queue_head_t	read_wait;
 	struct sk_buff_head	readq;
@@ -78,6 +79,7 @@ struct tun_struct {
 #define TUNSETPERSIST _IOW('T', 203, int) 
 #define TUNSETOWNER   _IOW('T', 204, int)
 #define TUNSETLINK    _IOW('T', 205, int)
+#define TUNSETGROUP   _IOW('T', 206, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
cgit v1.3-8-gc7d7


From 89da1ecf5483e6aa29b456a15ad6d05a6797c5a5 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <samuel@sortiz.org>
Date: Mon, 2 Jul 2007 22:54:18 -0700
Subject: [IrDA]: Netlink layer.

First IrDA configuration netlink layer implementation.
Currently, we only support the set/get mode commands.

Signed-off-by: Samuel Ortiz <samuel@sortiz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/irda.h     |  27 ++++++++
 include/net/irda/irda.h  |   3 +
 include/net/irda/irlap.h |   2 +
 net/irda/Makefile        |   2 +-
 net/irda/irmod.c         |  48 +++++++++++--
 net/irda/irnetlink.c     | 170 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 247 insertions(+), 5 deletions(-)
 create mode 100644 net/irda/irnetlink.c

(limited to 'include/linux')

diff --git a/include/linux/irda.h b/include/linux/irda.h
index 945ba3110874..35911bd4dbe8 100644
--- a/include/linux/irda.h
+++ b/include/linux/irda.h
@@ -216,6 +216,33 @@ struct if_irda_req {
 #define ifr_dtr       ifr_ifru.ifru_line.dtr
 #define ifr_rts       ifr_ifru.ifru_line.rts
 
+
+/* IrDA netlink definitions */
+#define IRDA_NL_NAME "irda"
+#define IRDA_NL_VERSION 1
+
+enum irda_nl_commands {
+	IRDA_NL_CMD_UNSPEC,
+	IRDA_NL_CMD_SET_MODE,
+	IRDA_NL_CMD_GET_MODE,
+
+	__IRDA_NL_CMD_AFTER_LAST
+};
+#define IRDA_NL_CMD_MAX (__IRDA_NL_CMD_AFTER_LAST - 1)
+
+enum nl80211_attrs {
+	IRDA_NL_ATTR_UNSPEC,
+	IRDA_NL_ATTR_IFNAME,
+	IRDA_NL_ATTR_MODE,
+
+	__IRDA_NL_ATTR_AFTER_LAST
+};
+#define IRDA_NL_ATTR_MAX (__IRDA_NL_ATTR_AFTER_LAST - 1)
+
+/* IrDA modes */
+#define IRDA_MODE_PRIMARY   0x1
+#define IRDA_MODE_SECONDARY 0x2
+
 #endif /* KERNEL_IRDA_H */
 
 
diff --git a/include/net/irda/irda.h b/include/net/irda/irda.h
index 36bee441aa56..08387553b57e 100644
--- a/include/net/irda/irda.h
+++ b/include/net/irda/irda.h
@@ -125,6 +125,9 @@ extern void irda_sysctl_unregister(void);
 extern int irsock_init(void);
 extern void irsock_cleanup(void);
 
+extern int irda_nl_register(void);
+extern void irda_nl_unregister(void);
+
 extern int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
 			    struct packet_type *ptype,
 			    struct net_device *orig_dev);
diff --git a/include/net/irda/irlap.h b/include/net/irda/irlap.h
index a3d370efb903..9d0c78ea92f5 100644
--- a/include/net/irda/irlap.h
+++ b/include/net/irda/irlap.h
@@ -208,6 +208,8 @@ struct irlap_cb {
 	int    xbofs_delay;   /* Nr of XBOF's used to MTT */
 	int    bofs_count;    /* Negotiated extra BOFs */
 	int    next_bofs;     /* Negotiated extra BOFs after next frame */
+
+	int    mode;     /* IrLAP mode (primary, secondary or monitor) */
 };
 
 /* 
diff --git a/net/irda/Makefile b/net/irda/Makefile
index d1366c2a39cb..187f6c563a4b 100644
--- a/net/irda/Makefile
+++ b/net/irda/Makefile
@@ -10,6 +10,6 @@ obj-$(CONFIG_IRCOMM) += ircomm/
 irda-y := iriap.o iriap_event.o irlmp.o irlmp_event.o irlmp_frame.o \
           irlap.o irlap_event.o irlap_frame.o timer.o qos.o irqueue.o \
           irttp.o irda_device.o irias_object.o wrapper.o af_irda.o \
-	  discovery.o parameters.o irmod.o
+	  discovery.o parameters.o irnetlink.o irmod.o
 irda-$(CONFIG_PROC_FS) += irproc.o
 irda-$(CONFIG_SYSCTL) += irsysctl.o
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index c7fad2c5b9f3..1900937b3328 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -88,16 +88,23 @@ EXPORT_SYMBOL(irda_notify_init);
  */
 static int __init irda_init(void)
 {
+	int ret = 0;
+
 	IRDA_DEBUG(0, "%s()\n", __FUNCTION__);
 
 	/* Lower layer of the stack */
 	irlmp_init();
 	irlap_init();
 
+	/* Driver/dongle support */
+	irda_device_init();
+
 	/* Higher layers of the stack */
 	iriap_init();
 	irttp_init();
-	irsock_init();
+	ret = irsock_init();
+	if (ret < 0)
+		goto out_err_1;
 
 	/* Add IrDA packet type (Start receiving packets) */
 	dev_add_pack(&irda_packet_type);
@@ -107,13 +114,44 @@ static int __init irda_init(void)
 	irda_proc_register();
 #endif
 #ifdef CONFIG_SYSCTL
-	irda_sysctl_register();
+	ret = irda_sysctl_register();
+	if (ret < 0)
+		goto out_err_2;
 #endif
 
-	/* Driver/dongle support */
-	irda_device_init();
+	ret = irda_nl_register();
+	if (ret < 0)
+		goto out_err_3;
 
 	return 0;
+
+ out_err_3:
+#ifdef CONFIG_SYSCTL
+	irda_sysctl_unregister();
+#endif
+ out_err_2:
+#ifdef CONFIG_PROC_FS
+	irda_proc_unregister();
+#endif
+
+	/* Remove IrDA packet type (stop receiving packets) */
+	dev_remove_pack(&irda_packet_type);
+
+	/* Remove higher layers */
+	irsock_cleanup();
+ out_err_1:
+	irttp_cleanup();
+	iriap_cleanup();
+
+	/* Remove lower layers */
+	irda_device_cleanup();
+	irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */
+
+	/* Remove middle layer */
+	irlmp_cleanup();
+
+
+	return ret;
 }
 
 /*
@@ -125,6 +163,8 @@ static int __init irda_init(void)
 static void __exit irda_cleanup(void)
 {
 	/* Remove External APIs */
+	irda_nl_unregister();
+
 #ifdef CONFIG_SYSCTL
 	irda_sysctl_unregister();
 #endif
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
new file mode 100644
index 000000000000..db716580e1ae
--- /dev/null
+++ b/net/irda/irnetlink.c
@@ -0,0 +1,170 @@
+/*
+ * IrDA netlink layer, for stack configuration.
+ *
+ * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz>
+ *
+ * Partly based on the 802.11 nelink implementation
+ * (see net/wireless/nl80211.c) which is:
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/socket.h>
+#include <linux/irda.h>
+#include <net/sock.h>
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/genetlink.h>
+
+
+
+static struct genl_family irda_nl_family = {
+	.id = GENL_ID_GENERATE,
+	.name = IRDA_NL_NAME,
+	.hdrsize = 0,
+	.version = IRDA_NL_VERSION,
+	.maxattr = IRDA_NL_CMD_MAX,
+};
+
+static struct net_device * ifname_to_netdev(struct genl_info *info)
+{
+	char * ifname;
+
+	if (!info->attrs[IRDA_NL_ATTR_IFNAME])
+		return NULL;
+
+	ifname = nla_data(info->attrs[IRDA_NL_ATTR_IFNAME]);
+
+	IRDA_DEBUG(5, "%s(): Looking for %s\n", __FUNCTION__, ifname);
+
+	return dev_get_by_name(ifname);
+}
+
+static int irda_nl_set_mode(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device * dev;
+	struct irlap_cb * irlap;
+	u32 mode;
+
+	if (!info->attrs[IRDA_NL_ATTR_MODE])
+		return -EINVAL;
+
+	mode = nla_get_u32(info->attrs[IRDA_NL_ATTR_MODE]);
+
+	IRDA_DEBUG(5, "%s(): Switching to mode: %d\n", __FUNCTION__, mode);
+
+	dev = ifname_to_netdev(info);
+	if (!dev)
+		return -ENODEV;
+
+	irlap = (struct irlap_cb *)dev->atalk_ptr;
+	if (!irlap) {
+		dev_put(dev);
+		return -ENODEV;
+	}
+
+	irlap->mode = mode;
+
+	dev_put(dev);
+
+	return 0;
+}
+
+static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device * dev;
+	struct irlap_cb * irlap;
+	struct sk_buff *msg;
+	void *hdr;
+	int ret = -ENOBUFS;
+
+	dev = ifname_to_netdev(info);
+	if (!dev)
+		return -ENODEV;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg) {
+		dev_put(dev);
+		return -ENOMEM;
+	}
+
+	irlap = (struct irlap_cb *)dev->atalk_ptr;
+	if (!irlap) {
+		ret = -ENODEV;
+		goto err_out;
+	}
+
+	hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
+			  &irda_nl_family, 0,  IRDA_NL_CMD_GET_MODE);
+	if (IS_ERR(hdr)) {
+		ret = PTR_ERR(hdr);
+		goto err_out;
+	}
+
+	if(nla_put_string(msg, IRDA_NL_ATTR_IFNAME,
+			  dev->name));
+		goto err_out;
+
+	if(nla_put_u32(msg, IRDA_NL_ATTR_MODE, irlap->mode))
+		goto err_out;
+
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_unicast(msg, info->snd_pid);
+
+ err_out:
+	nlmsg_free(msg);
+	dev_put(dev);
+
+	return ret;
+}
+
+static struct nla_policy irda_nl_policy[IRDA_NL_ATTR_MAX + 1] = {
+	[IRDA_NL_ATTR_IFNAME] = { .type = NLA_NUL_STRING,
+				  .len = IFNAMSIZ-1 },
+	[IRDA_NL_ATTR_MODE] = { .type = NLA_U32 },
+};
+
+static struct genl_ops irda_nl_ops[] = {
+	{
+		.cmd = IRDA_NL_CMD_SET_MODE,
+		.doit = irda_nl_set_mode,
+		.policy = irda_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = IRDA_NL_CMD_GET_MODE,
+		.doit = irda_nl_get_mode,
+		.policy = irda_nl_policy,
+		/* can be retrieved by unprivileged users */
+	},
+
+};
+
+int irda_nl_register(void)
+{
+	int err, i;
+
+	err = genl_register_family(&irda_nl_family);
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(irda_nl_ops); i++) {
+		err = genl_register_ops(&irda_nl_family, &irda_nl_ops[i]);
+		if (err)
+			goto err_out;
+	}
+	return 0;
+ err_out:
+	genl_unregister_family(&irda_nl_family);
+	return err;
+}
+
+void irda_nl_unregister(void)
+{
+	genl_unregister_family(&irda_nl_family);
+}
-- 
cgit v1.3-8-gc7d7


From 411725280bd0058ebb83c0e32133b7a94902c3a6 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <samuel@sortiz.org>
Date: Mon, 2 Jul 2007 22:55:31 -0700
Subject: [IrDA]: Monitor mode.

Through the IrDA netlink set mode command, we switch to IrDA monitor
mode, where one IrLAP instance receives all the packets on the media,
without ever responding to them.

Signed-off-by: Samuel Ortiz <samuel@sortiz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/irda.h   | 1 +
 net/irda/irlap_frame.c | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irda.h b/include/linux/irda.h
index 35911bd4dbe8..8e3735714c1c 100644
--- a/include/linux/irda.h
+++ b/include/linux/irda.h
@@ -242,6 +242,7 @@ enum nl80211_attrs {
 /* IrDA modes */
 #define IRDA_MODE_PRIMARY   0x1
 #define IRDA_MODE_SECONDARY 0x2
+#define IRDA_MODE_MONITOR   0x4
 
 #endif /* KERNEL_IRDA_H */
 
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 3013c49ab975..25a3444a9234 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -101,6 +101,13 @@ void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb)
 
 	irlap_insert_info(self, skb);
 
+	if (unlikely(self->mode & IRDA_MODE_MONITOR)) {
+		IRDA_DEBUG(3, "%s(): %s is in monitor mode\n", __FUNCTION__,
+			   self->netdev->name);
+		dev_kfree_skb(skb);
+		return;
+	}
+
 	dev_queue_xmit(skb);
 }
 
-- 
cgit v1.3-8-gc7d7


From 7bfe24611671ec76b44281e582b38535e21f01a9 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki@netfilter.org>
Date: Sat, 7 Jul 2007 22:14:23 -0700
Subject: [NETFILTER]: ip6_tables: fix explanation of valid upper protocol
 number

This explains the allowed upper protocol numbers. IP6T_F_NOPROTO was
introduced to use 0 as Hop-by-Hop option header, not wildcard. But that
seemed to be forgotten. 0 has been used as wildcard since 2002-08-23.

Signed-off-by: Yasuyuki Kozakai <yasuyuki@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6/ip6_tables.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 4686f8342cbd..9a720f05888f 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -44,8 +44,14 @@ struct ip6t_ip6 {
 	char iniface[IFNAMSIZ], outiface[IFNAMSIZ];
 	unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ];
 
-	/* ARGH, HopByHop uses 0, so can't do 0 = ANY,
-	   instead IP6T_F_NOPROTO must be set */
+	/* Upper protocol number
+	 * - The allowed value is 0 (any) or protocol number of last parsable
+	 *   header, which is 50 (ESP), 59 (No Next Header), 135 (MH), or
+	 *   the non IPv6 extension headers.
+	 * - The protocol numbers of IPv6 extension headers except of ESP and
+	 *   MH do not match any packets.
+	 * - You also need to set IP6T_FLAGS_PROTO to "flags" to check protocol.
+	 */
 	u_int16_t proto;
 	/* TOS to match iff flags & IP6T_F_TOS */
 	u_int8_t tos;
-- 
cgit v1.3-8-gc7d7


From cff533ac12494fa002e2c46acc94d670e5f636a2 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sat, 7 Jul 2007 22:15:12 -0700
Subject: [NETFILTER]: x_tables: switch hotdrop to bool

Switch the "hotdrop" variables to boolean

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h   |  2 +-
 net/ipv4/netfilter/arp_tables.c      |  2 +-
 net/ipv4/netfilter/ip_tables.c       |  8 ++++----
 net/ipv4/netfilter/ipt_addrtype.c    |  2 +-
 net/ipv4/netfilter/ipt_ah.c          |  4 ++--
 net/ipv4/netfilter/ipt_ecn.c         |  6 +++---
 net/ipv4/netfilter/ipt_iprange.c     |  2 +-
 net/ipv4/netfilter/ipt_owner.c       |  2 +-
 net/ipv4/netfilter/ipt_recent.c      |  4 ++--
 net/ipv4/netfilter/ipt_tos.c         |  2 +-
 net/ipv4/netfilter/ipt_ttl.c         |  2 +-
 net/ipv6/netfilter/ip6_tables.c      | 12 ++++++------
 net/ipv6/netfilter/ip6t_ah.c         |  6 +++---
 net/ipv6/netfilter/ip6t_eui64.c      |  4 ++--
 net/ipv6/netfilter/ip6t_frag.c       |  6 +++---
 net/ipv6/netfilter/ip6t_hbh.c        |  6 +++---
 net/ipv6/netfilter/ip6t_hl.c         |  2 +-
 net/ipv6/netfilter/ip6t_ipv6header.c |  2 +-
 net/ipv6/netfilter/ip6t_mh.c         |  6 +++---
 net/ipv6/netfilter/ip6t_owner.c      |  2 +-
 net/ipv6/netfilter/ip6t_rt.c         |  6 +++---
 net/netfilter/xt_comment.c           |  2 +-
 net/netfilter/xt_connbytes.c         |  2 +-
 net/netfilter/xt_connmark.c          |  2 +-
 net/netfilter/xt_conntrack.c         |  2 +-
 net/netfilter/xt_dccp.c              | 12 ++++++------
 net/netfilter/xt_dscp.c              |  4 ++--
 net/netfilter/xt_esp.c               |  4 ++--
 net/netfilter/xt_hashlimit.c         |  4 ++--
 net/netfilter/xt_helper.c            |  2 +-
 net/netfilter/xt_length.c            |  4 ++--
 net/netfilter/xt_limit.c             |  2 +-
 net/netfilter/xt_mac.c               |  2 +-
 net/netfilter/xt_mark.c              |  2 +-
 net/netfilter/xt_multiport.c         |  8 ++++----
 net/netfilter/xt_physdev.c           |  2 +-
 net/netfilter/xt_pkttype.c           |  2 +-
 net/netfilter/xt_policy.c            |  2 +-
 net/netfilter/xt_quota.c             |  2 +-
 net/netfilter/xt_realm.c             |  2 +-
 net/netfilter/xt_sctp.c              |  8 ++++----
 net/netfilter/xt_state.c             |  2 +-
 net/netfilter/xt_statistic.c         |  2 +-
 net/netfilter/xt_string.c            |  2 +-
 net/netfilter/xt_tcpmss.c            |  4 ++--
 net/netfilter/xt_tcpudp.c            | 16 ++++++++--------
 46 files changed, 92 insertions(+), 92 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 7e733a6ba4f6..b8577d18d10d 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -148,7 +148,7 @@ struct xt_match
 		     const void *matchinfo,
 		     int offset,
 		     unsigned int protoff,
-		     int *hotdrop);
+		     bool *hotdrop);
 
 	/* Called when user tries to insert an entry of this type. */
 	/* Should return true or false. */
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index cae41215e3c7..1d75a5cd7b44 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -224,7 +224,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 	static const char nulldevname[IFNAMSIZ];
 	unsigned int verdict = NF_DROP;
 	struct arphdr *arp;
-	int hotdrop = 0;
+	bool hotdrop = false;
 	struct arpt_entry *e, *back;
 	const char *indev, *outdev;
 	void *table_base;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 9bacf1a03630..e2a893825656 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -188,7 +188,7 @@ int do_match(struct ipt_entry_match *m,
 	     const struct net_device *in,
 	     const struct net_device *out,
 	     int offset,
-	     int *hotdrop)
+	     bool *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
 	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
@@ -216,7 +216,7 @@ ipt_do_table(struct sk_buff **pskb,
 	u_int16_t offset;
 	struct iphdr *ip;
 	u_int16_t datalen;
-	int hotdrop = 0;
+	bool hotdrop = false;
 	/* Initializing verdict to NF_DROP keeps gcc happy. */
 	unsigned int verdict = NF_DROP;
 	const char *indev, *outdev;
@@ -2122,7 +2122,7 @@ icmp_match(const struct sk_buff *skb,
 	   const void *matchinfo,
 	   int offset,
 	   unsigned int protoff,
-	   int *hotdrop)
+	   bool *hotdrop)
 {
 	struct icmphdr _icmph, *ic;
 	const struct ipt_icmp *icmpinfo = matchinfo;
@@ -2137,7 +2137,7 @@ icmp_match(const struct sk_buff *skb,
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("Dropping evil ICMP tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index a652a1451552..a9a9b750ff2d 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -30,7 +30,7 @@ static inline int match_type(__be32 addr, u_int16_t mask)
 static int match(const struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, int *hotdrop)
+		 int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_addrtype_info *info = matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 18a16782cf40..9a244e406a48 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -44,7 +44,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	struct ip_auth_hdr _ahdr, *ah;
 	const struct ipt_ah *ahinfo = matchinfo;
@@ -60,7 +60,7 @@ match(const struct sk_buff *skb,
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("Dropping evil AH tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 26218122f865..a47f3745553b 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -30,7 +30,7 @@ static inline int match_ip(const struct sk_buff *skb,
 
 static inline int match_tcp(const struct sk_buff *skb,
 			    const struct ipt_ecn_info *einfo,
-			    int *hotdrop)
+			    bool *hotdrop)
 {
 	struct tcphdr _tcph, *th;
 
@@ -39,7 +39,7 @@ static inline int match_tcp(const struct sk_buff *skb,
 	 */
 	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
 	if (th == NULL) {
-		*hotdrop = 0;
+		*hotdrop = false;
 		return 0;
 	}
 
@@ -69,7 +69,7 @@ static inline int match_tcp(const struct sk_buff *skb,
 static int match(const struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, int *hotdrop)
+		 int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_ecn_info *info = matchinfo;
 
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
index 33af9e940887..86f225c1d067 100644
--- a/net/ipv4/netfilter/ipt_iprange.c
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -29,7 +29,7 @@ match(const struct sk_buff *skb,
       const struct net_device *out,
       const struct xt_match *match,
       const void *matchinfo,
-      int offset, unsigned int protoff, int *hotdrop)
+      int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_iprange_info *info = matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 7fae9aa8944c..92be562c4aca 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -29,7 +29,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct ipt_owner_info *info = matchinfo;
 
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 15a9e8bbb7cc..81f1a017f311 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -173,7 +173,7 @@ static int
 ipt_recent_match(const struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, int *hotdrop)
+		 int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_recent_info *info = matchinfo;
 	struct recent_table *t;
@@ -201,7 +201,7 @@ ipt_recent_match(const struct sk_buff *skb,
 			goto out;
 		e = recent_entry_init(t, addr, ttl);
 		if (e == NULL)
-			*hotdrop = 1;
+			*hotdrop = true;
 		ret ^= 1;
 		goto out;
 	}
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
index d314844af12b..803ed4c35b55 100644
--- a/net/ipv4/netfilter/ipt_tos.c
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -26,7 +26,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct ipt_tos_info *info = matchinfo;
 
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index ab02d9e3139c..e7316b27d2c5 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -21,7 +21,7 @@ MODULE_LICENSE("GPL");
 static int match(const struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, int *hotdrop)
+		 int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_ttl_info *info = matchinfo;
 	const u8 ttl = ip_hdr(skb)->ttl;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 9aa624026688..13c66a75c21c 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -102,7 +102,7 @@ ip6_packet_match(const struct sk_buff *skb,
 		 const char *outdev,
 		 const struct ip6t_ip6 *ip6info,
 		 unsigned int *protoff,
-		 int *fragoff, int *hotdrop)
+		 int *fragoff, bool *hotdrop)
 {
 	size_t i;
 	unsigned long ret;
@@ -162,7 +162,7 @@ ip6_packet_match(const struct sk_buff *skb,
 		protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off);
 		if (protohdr < 0) {
 			if (_frag_off == 0)
-				*hotdrop = 1;
+				*hotdrop = true;
 			return 0;
 		}
 		*fragoff = _frag_off;
@@ -225,7 +225,7 @@ int do_match(struct ip6t_entry_match *m,
 	     const struct net_device *out,
 	     int offset,
 	     unsigned int protoff,
-	     int *hotdrop)
+	     bool *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
 	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
@@ -252,7 +252,7 @@ ip6t_do_table(struct sk_buff **pskb,
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
 	int offset = 0;
 	unsigned int protoff = 0;
-	int hotdrop = 0;
+	bool hotdrop = false;
 	/* Initializing verdict to NF_DROP keeps gcc happy. */
 	unsigned int verdict = NF_DROP;
 	const char *indev, *outdev;
@@ -1299,7 +1299,7 @@ icmp6_match(const struct sk_buff *skb,
 	   const void *matchinfo,
 	   int offset,
 	   unsigned int protoff,
-	   int *hotdrop)
+	   bool *hotdrop)
 {
 	struct icmp6hdr _icmp, *ic;
 	const struct ip6t_icmp *icmpinfo = matchinfo;
@@ -1313,7 +1313,7 @@ icmp6_match(const struct sk_buff *skb,
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil ICMP tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index d3c154371b41..27b7bd279c0e 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -49,7 +49,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	struct ip_auth_hdr *ah, _ah;
 	const struct ip6t_ah *ahinfo = matchinfo;
@@ -60,13 +60,13 @@ match(const struct sk_buff *skb,
 	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = 1;
+			*hotdrop = true;
 		return 0;
 	}
 
 	ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
 	if (ah == NULL) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index 0f3dd932f0a6..69e79e19040e 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -27,7 +27,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	unsigned char eui64[8];
 	int i = 0;
@@ -35,7 +35,7 @@ match(const struct sk_buff *skb,
 	if (!(skb_mac_header(skb) >= skb->head &&
 	      (skb_mac_header(skb) + ETH_HLEN) <= skb->data) &&
 	    offset != 0) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 5a5da71321b6..740fdcafa5f3 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -48,7 +48,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	struct frag_hdr _frag, *fh;
 	const struct ip6t_frag *fraginfo = matchinfo;
@@ -58,13 +58,13 @@ match(const struct sk_buff *skb,
 	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = 1;
+			*hotdrop = true;
 		return 0;
 	}
 
 	fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
 	if (fh == NULL) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index d2373c7cd354..5633de160c6d 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -55,7 +55,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	struct ipv6_opt_hdr _optsh, *oh;
 	const struct ip6t_opts *optinfo = matchinfo;
@@ -71,13 +71,13 @@ match(const struct sk_buff *skb,
 	err = ipv6_find_hdr(skb, &ptr, match->data, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = 1;
+			*hotdrop = true;
 		return 0;
 	}
 
 	oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
 	if (oh == NULL) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c
index d606c0e6d6fd..cbf49cffa067 100644
--- a/net/ipv6/netfilter/ip6t_hl.c
+++ b/net/ipv6/netfilter/ip6t_hl.c
@@ -22,7 +22,7 @@ MODULE_LICENSE("GPL");
 static int match(const struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, int *hotdrop)
+		 int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ip6t_hl_info *info = matchinfo;
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index fd6a0869099b..469dec27c649 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -34,7 +34,7 @@ ipv6header_match(const struct sk_buff *skb,
 		 const void *matchinfo,
 		 int offset,
 		 unsigned int protoff,
-		 int *hotdrop)
+		 bool *hotdrop)
 {
 	const struct ip6t_ipv6header_info *info = matchinfo;
 	unsigned int temp;
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
index c2a909893a64..c27647b6c274 100644
--- a/net/ipv6/netfilter/ip6t_mh.c
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -48,7 +48,7 @@ match(const struct sk_buff *skb,
 	 const void *matchinfo,
 	 int offset,
 	 unsigned int protoff,
-	 int *hotdrop)
+	 bool *hotdrop)
 {
 	struct ip6_mh _mh, *mh;
 	const struct ip6t_mh *mhinfo = matchinfo;
@@ -62,14 +62,14 @@ match(const struct sk_buff *skb,
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil MH tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
 	if (mh->ip6mh_proto != IPPROTO_NONE) {
 		duprintf("Dropping invalid MH Payload Proto: %u\n",
 			 mh->ip6mh_proto);
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index 43738bba00b5..f90f7c32cc9e 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -31,7 +31,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct ip6t_owner_info *info = matchinfo;
 
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 81ab00d8c182..2bb88214cfda 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -50,7 +50,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	struct ipv6_rt_hdr _route, *rh;
 	const struct ip6t_rt *rtinfo = matchinfo;
@@ -64,13 +64,13 @@ match(const struct sk_buff *skb,
 	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL);
 	if (err < 0) {
 		if (err != -ENOENT)
-			*hotdrop = 1;
+			*hotdrop = true;
 		return 0;
 	}
 
 	rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
 	if (rh == NULL) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
index 7db492d65220..20690ea0d466 100644
--- a/net/netfilter/xt_comment.c
+++ b/net/netfilter/xt_comment.c
@@ -23,7 +23,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protooff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	/* We always match */
 	return 1;
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 804afe55e141..8fe5775901e1 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -23,7 +23,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_connbytes_info *sinfo = matchinfo;
 	struct nf_conn *ct;
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index e1803256c792..8a6d58ab5d2b 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -38,7 +38,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_connmark_info *info = matchinfo;
 	struct nf_conn *ct;
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 189ded5f378b..915c730d3b72 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -27,7 +27,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_conntrack_info *sinfo = matchinfo;
 	struct nf_conn *ct;
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index 2c9c0dee8aaf..3172e7308b35 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -36,7 +36,7 @@ dccp_find_option(u_int8_t option,
 		 const struct sk_buff *skb,
 		 unsigned int protoff,
 		 const struct dccp_hdr *dh,
-		 int *hotdrop)
+		 bool *hotdrop)
 {
 	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
 	unsigned char *op;
@@ -45,7 +45,7 @@ dccp_find_option(u_int8_t option,
 	unsigned int i;
 
 	if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
@@ -57,7 +57,7 @@ dccp_find_option(u_int8_t option,
 	if (op == NULL) {
 		/* If we don't have the whole header, drop packet. */
 		spin_unlock_bh(&dccp_buflock);
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
@@ -86,7 +86,7 @@ match_types(const struct dccp_hdr *dh, u_int16_t typemask)
 
 static inline int
 match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff,
-	     const struct dccp_hdr *dh, int *hotdrop)
+	     const struct dccp_hdr *dh, bool *hotdrop)
 {
 	return dccp_find_option(option, skb, protoff, dh, hotdrop);
 }
@@ -99,7 +99,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_dccp_info *info = matchinfo;
 	struct dccp_hdr _dh, *dh;
@@ -109,7 +109,7 @@ match(const struct sk_buff *skb,
 
 	dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh);
 	if (dh == NULL) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 56b247ecc283..c106d738da6d 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -29,7 +29,7 @@ static int match(const struct sk_buff *skb,
 		 const void *matchinfo,
 		 int offset,
 		 unsigned int protoff,
-		 int *hotdrop)
+		 bool *hotdrop)
 {
 	const struct xt_dscp_info *info = matchinfo;
 	u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
@@ -44,7 +44,7 @@ static int match6(const struct sk_buff *skb,
 		  const void *matchinfo,
 		  int offset,
 		  unsigned int protoff,
-		  int *hotdrop)
+		  bool *hotdrop)
 {
 	const struct xt_dscp_info *info = matchinfo;
 	u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index 7c95f149d942..5d3421bcd850 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -50,7 +50,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	struct ip_esp_hdr _esp, *eh;
 	const struct xt_esp *espinfo = matchinfo;
@@ -65,7 +65,7 @@ match(const struct sk_buff *skb,
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("Dropping evil ESP tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index d3043fa32ebc..cd5cba6978c3 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -440,7 +440,7 @@ hashlimit_match(const struct sk_buff *skb,
 		const void *matchinfo,
 		int offset,
 		unsigned int protoff,
-		int *hotdrop)
+		bool *hotdrop)
 {
 	struct xt_hashlimit_info *r =
 		((struct xt_hashlimit_info *)matchinfo)->u.master;
@@ -487,7 +487,7 @@ hashlimit_match(const struct sk_buff *skb,
 	return 0;
 
 hotdrop:
-	*hotdrop = 1;
+	*hotdrop = true;
 	return 0;
 }
 
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index c139b2f43a10..0aa090776e27 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -36,7 +36,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_helper_info *info = matchinfo;
 	struct nf_conn *ct;
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index 77288c5ada78..621c9ee6d1c9 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -28,7 +28,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_length_info *info = matchinfo;
 	u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
@@ -44,7 +44,7 @@ match6(const struct sk_buff *skb,
        const void *matchinfo,
        int offset,
        unsigned int protoff,
-       int *hotdrop)
+       bool *hotdrop)
 {
 	const struct xt_length_info *info = matchinfo;
 	const u_int16_t pktlen = (ntohs(ipv6_hdr(skb)->payload_len) +
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 571a72ab89ad..1133b4ca4904 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -65,7 +65,7 @@ ipt_limit_match(const struct sk_buff *skb,
 		const void *matchinfo,
 		int offset,
 		unsigned int protoff,
-		int *hotdrop)
+		bool *hotdrop)
 {
 	struct xt_rateinfo *r = ((struct xt_rateinfo *)matchinfo)->master;
 	unsigned long now = jiffies;
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 1d3a1d98b885..0e6a28647206 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -32,7 +32,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
     const struct xt_mac_info *info = matchinfo;
 
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 39911dddb011..944d1ea56029 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -27,7 +27,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_mark_info *info = matchinfo;
 
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index 4dce2a81702a..1dc53ded9887 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -102,7 +102,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	__be16 _ports[2], *pptr;
 	const struct xt_multiport *multiinfo = matchinfo;
@@ -116,7 +116,7 @@ match(const struct sk_buff *skb,
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
@@ -133,7 +133,7 @@ match_v1(const struct sk_buff *skb,
 	 const void *matchinfo,
 	 int offset,
 	 unsigned int protoff,
-	 int *hotdrop)
+	 bool *hotdrop)
 {
 	__be16 _ports[2], *pptr;
 	const struct xt_multiport_v1 *multiinfo = matchinfo;
@@ -147,7 +147,7 @@ match_v1(const struct sk_buff *skb,
 		 * can't.  Hence, no choice but to drop.
 		 */
 		duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 35a0fe200c39..a6de512fa840 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -31,7 +31,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	int i;
 	static const char nulldevname[IFNAMSIZ];
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index e1409fc5c288..692581f40c5f 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -28,7 +28,7 @@ static int match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	u_int8_t type;
 	const struct xt_pkttype_info *info = matchinfo;
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 15b45a95ec13..6878482cd527 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -115,7 +115,7 @@ static int match(const struct sk_buff *skb,
 		 const void *matchinfo,
 		 int offset,
 		 unsigned int protoff,
-		 int *hotdrop)
+		 bool *hotdrop)
 {
 	const struct xt_policy_info *info = matchinfo;
 	int ret;
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index bfdde06ca0b7..53c71ac980fc 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -20,7 +20,7 @@ static int
 match(const struct sk_buff *skb,
       const struct net_device *in, const struct net_device *out,
       const struct xt_match *match, const void *matchinfo,
-      int offset, unsigned int protoff, int *hotdrop)
+      int offset, unsigned int protoff, bool *hotdrop)
 {
 	struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master;
 	int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0;
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index c2017f8af9c4..41451f57919c 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -29,7 +29,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_realm_info *info = matchinfo;
 	struct dst_entry *dst = skb->dst;
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index f86d8d769d47..e581afe89098 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -47,7 +47,7 @@ match_packet(const struct sk_buff *skb,
 	     int chunk_match_type,
 	     const struct xt_sctp_flag_info *flag_info,
 	     const int flag_count,
-	     int *hotdrop)
+	     bool *hotdrop)
 {
 	u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
 	sctp_chunkhdr_t _sch, *sch;
@@ -64,7 +64,7 @@ match_packet(const struct sk_buff *skb,
 		sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
 		if (sch == NULL || sch->length == 0) {
 			duprintf("Dropping invalid SCTP packet.\n");
-			*hotdrop = 1;
+			*hotdrop = true;
 			return 0;
 		}
 
@@ -127,7 +127,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_sctp_info *info = matchinfo;
 	sctp_sctphdr_t _sh, *sh;
@@ -140,7 +140,7 @@ match(const struct sk_buff *skb,
 	sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh);
 	if (sh == NULL) {
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 	duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index 149294f7df71..74fe069fc3aa 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -28,7 +28,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_state_info *sinfo = matchinfo;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 091a9f89f5d5..4e5ed81e9ce1 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -28,7 +28,7 @@ static int
 match(const struct sk_buff *skb,
       const struct net_device *in, const struct net_device *out,
       const struct xt_match *match, const void *matchinfo,
-      int offset, unsigned int protoff, int *hotdrop)
+      int offset, unsigned int protoff, bool *hotdrop)
 {
 	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
 	int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0;
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 999a005dbd0c..7552d8927570 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -28,7 +28,7 @@ static int match(const struct sk_buff *skb,
 		 const void *matchinfo,
 		 int offset,
 		 unsigned int protoff,
-		 int *hotdrop)
+		 bool *hotdrop)
 {
 	const struct xt_string_info *conf = matchinfo;
 	struct ts_state state;
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 80571d0749f7..0db4f5362180 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -31,7 +31,7 @@ match(const struct sk_buff *skb,
       const void *matchinfo,
       int offset,
       unsigned int protoff,
-      int *hotdrop)
+      bool *hotdrop)
 {
 	const struct xt_tcpmss_match_info *info = matchinfo;
 	struct tcphdr _tcph, *th;
@@ -77,7 +77,7 @@ out:
 	return info->invert;
 
 dropit:
-	*hotdrop = 1;
+	*hotdrop = true;
 	return 0;
 }
 
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 46414b562a19..ca9ccdd931bc 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -42,7 +42,7 @@ tcp_find_option(u_int8_t option,
 		unsigned int protoff,
 		unsigned int optlen,
 		int invert,
-		int *hotdrop)
+		bool *hotdrop)
 {
 	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
 	u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
@@ -57,7 +57,7 @@ tcp_find_option(u_int8_t option,
 	op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr),
 				optlen, _opt);
 	if (op == NULL) {
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
@@ -78,7 +78,7 @@ tcp_match(const struct sk_buff *skb,
 	  const void *matchinfo,
 	  int offset,
 	  unsigned int protoff,
-	  int *hotdrop)
+	  bool *hotdrop)
 {
 	struct tcphdr _tcph, *th;
 	const struct xt_tcp *tcpinfo = matchinfo;
@@ -92,7 +92,7 @@ tcp_match(const struct sk_buff *skb,
 		*/
 		if (offset == 1) {
 			duprintf("Dropping evil TCP offset=1 frag.\n");
-			*hotdrop = 1;
+			*hotdrop = true;
 		}
 		/* Must not be a fragment. */
 		return 0;
@@ -105,7 +105,7 @@ tcp_match(const struct sk_buff *skb,
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
@@ -123,7 +123,7 @@ tcp_match(const struct sk_buff *skb,
 		return 0;
 	if (tcpinfo->option) {
 		if (th->doff * 4 < sizeof(_tcph)) {
-			*hotdrop = 1;
+			*hotdrop = true;
 			return 0;
 		}
 		if (!tcp_find_option(tcpinfo->option, skb, protoff,
@@ -157,7 +157,7 @@ udp_match(const struct sk_buff *skb,
 	  const void *matchinfo,
 	  int offset,
 	  unsigned int protoff,
-	  int *hotdrop)
+	  bool *hotdrop)
 {
 	struct udphdr _udph, *uh;
 	const struct xt_udp *udpinfo = matchinfo;
@@ -171,7 +171,7 @@ udp_match(const struct sk_buff *skb,
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil UDP tinygram.\n");
-		*hotdrop = 1;
+		*hotdrop = true;
 		return 0;
 	}
 
-- 
cgit v1.3-8-gc7d7


From 1d93a9cbad608f6398ba6c5b588c504ccd35a2ca Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sat, 7 Jul 2007 22:15:35 -0700
Subject: [NETFILTER]: x_tables: switch xt_match->match to bool

Switch the return type of match functions to boolean

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h   | 16 +++++++-------
 net/ipv4/netfilter/ip_tables.c       | 26 +++++++++++-----------
 net/ipv4/netfilter/ipt_addrtype.c    | 12 +++++------
 net/ipv4/netfilter/ipt_ah.c          | 10 ++++-----
 net/ipv4/netfilter/ipt_ecn.c         | 38 ++++++++++++++++----------------
 net/ipv4/netfilter/ipt_iprange.c     |  8 +++----
 net/ipv4/netfilter/ipt_owner.c       | 10 ++++-----
 net/ipv4/netfilter/ipt_recent.c      | 12 +++++------
 net/ipv4/netfilter/ipt_tos.c         |  2 +-
 net/ipv4/netfilter/ipt_ttl.c         | 12 +++++------
 net/ipv6/netfilter/ip6_tables.c      | 42 ++++++++++++++++++------------------
 net/ipv6/netfilter/ip6t_ah.c         | 12 +++++------
 net/ipv6/netfilter/ip6t_eui64.c      |  8 +++----
 net/ipv6/netfilter/ip6t_frag.c       | 12 +++++------
 net/ipv6/netfilter/ip6t_hbh.c        | 18 ++++++++--------
 net/ipv6/netfilter/ip6t_hl.c         | 12 +++++------
 net/ipv6/netfilter/ip6t_ipv6header.c |  6 +++---
 net/ipv6/netfilter/ip6t_mh.c         | 17 ++++++---------
 net/ipv6/netfilter/ip6t_owner.c      | 10 ++++-----
 net/ipv6/netfilter/ip6t_rt.c         | 26 +++++++++++-----------
 net/netfilter/xt_comment.c           |  4 ++--
 net/netfilter/xt_connbytes.c         |  4 ++--
 net/netfilter/xt_connmark.c          |  4 ++--
 net/netfilter/xt_conntrack.c         | 24 ++++++++++-----------
 net/netfilter/xt_dccp.c              | 22 +++++++++----------
 net/netfilter/xt_dscp.c              | 32 +++++++++++++--------------
 net/netfilter/xt_esp.c               | 12 +++++------
 net/netfilter/xt_hashlimit.c         | 17 ++++++++-------
 net/netfilter/xt_helper.c            |  6 +++---
 net/netfilter/xt_length.c            |  4 ++--
 net/netfilter/xt_limit.c             |  6 +++---
 net/netfilter/xt_mac.c               |  2 +-
 net/netfilter/xt_mark.c              |  2 +-
 net/netfilter/xt_multiport.c         | 34 ++++++++++++++---------------
 net/netfilter/xt_physdev.c           | 34 ++++++++++++++---------------
 net/netfilter/xt_pkttype.c           |  2 +-
 net/netfilter/xt_policy.c            | 26 +++++++++++-----------
 net/netfilter/xt_quota.c             |  6 +++---
 net/netfilter/xt_realm.c             |  2 +-
 net/netfilter/xt_sctp.c              | 26 +++++++++++-----------
 net/netfilter/xt_state.c             |  2 +-
 net/netfilter/xt_statistic.c         |  8 +++----
 net/netfilter/xt_string.c            | 16 +++++++-------
 net/netfilter/xt_tcpmss.c            |  4 ++--
 net/netfilter/xt_tcpudp.c            | 39 ++++++++++++++++-----------------
 45 files changed, 320 insertions(+), 327 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index b8577d18d10d..304fce356a43 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -141,14 +141,14 @@ struct xt_match
 	/* Arguments changed since 2.6.9, as this must now handle
 	   non-linear skb, using skb_header_pointer and
 	   skb_ip_make_writable. */
-	int (*match)(const struct sk_buff *skb,
-		     const struct net_device *in,
-		     const struct net_device *out,
-		     const struct xt_match *match,
-		     const void *matchinfo,
-		     int offset,
-		     unsigned int protoff,
-		     bool *hotdrop);
+	bool (*match)(const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct xt_match *match,
+		      const void *matchinfo,
+		      int offset,
+		      unsigned int protoff,
+		      bool *hotdrop);
 
 	/* Called when user tries to insert an entry of this type. */
 	/* Should return true or false. */
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e2a893825656..b9c792dd4890 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -183,19 +183,19 @@ ipt_error(struct sk_buff **pskb,
 }
 
 static inline
-int do_match(struct ipt_entry_match *m,
-	     const struct sk_buff *skb,
-	     const struct net_device *in,
-	     const struct net_device *out,
-	     int offset,
-	     bool *hotdrop)
+bool do_match(struct ipt_entry_match *m,
+	      const struct sk_buff *skb,
+	      const struct net_device *in,
+	      const struct net_device *out,
+	      int offset,
+	      bool *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
 	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
 				      offset, ip_hdrlen(skb), hotdrop))
-		return 1;
+		return true;
 	else
-		return 0;
+		return false;
 }
 
 static inline struct ipt_entry *
@@ -2105,16 +2105,16 @@ void ipt_unregister_table(struct xt_table *table)
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
-static inline int
+static inline bool
 icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
 		     u_int8_t type, u_int8_t code,
-		     int invert)
+		     bool invert)
 {
 	return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
 		^ invert;
 }
 
-static int
+static bool
 icmp_match(const struct sk_buff *skb,
 	   const struct net_device *in,
 	   const struct net_device *out,
@@ -2129,7 +2129,7 @@ icmp_match(const struct sk_buff *skb,
 
 	/* Must not be a fragment. */
 	if (offset)
-		return 0;
+		return false;
 
 	ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph);
 	if (ic == NULL) {
@@ -2138,7 +2138,7 @@ icmp_match(const struct sk_buff *skb,
 		 */
 		duprintf("Dropping evil ICMP tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return icmp_type_code_match(icmpinfo->type,
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index a9a9b750ff2d..abea446a4437 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -22,19 +22,19 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_DESCRIPTION("iptables addrtype match");
 
-static inline int match_type(__be32 addr, u_int16_t mask)
+static inline bool match_type(__be32 addr, u_int16_t mask)
 {
 	return !!(mask & (1 << inet_addr_type(addr)));
 }
 
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in, const struct net_device *out,
-		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, bool *hotdrop)
+static bool match(const struct sk_buff *skb,
+		  const struct net_device *in, const struct net_device *out,
+		  const struct xt_match *match, const void *matchinfo,
+		  int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_addrtype_info *info = matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
-	int ret = 1;
+	bool ret = true;
 
 	if (info->source)
 		ret &= match_type(iph->saddr, info->source)^info->invert_source;
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 9a244e406a48..3da39ee92d8b 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -25,10 +25,10 @@ MODULE_DESCRIPTION("iptables AH SPI match module");
 #endif
 
 /* Returns 1 if the spi is matched by the range, 0 otherwise */
-static inline int
-spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
 {
-	int r=0;
+	bool r;
 	duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
 		min,spi,max);
 	r=(spi >= min && spi <= max) ^ invert;
@@ -36,7 +36,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
 	return r;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -51,7 +51,7 @@ match(const struct sk_buff *skb,
 
 	/* Must not be a fragment. */
 	if (offset)
-		return 0;
+		return false;
 
 	ah = skb_header_pointer(skb, protoff,
 				sizeof(_ahdr), &_ahdr);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index a47f3745553b..ba3a17e0f848 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -22,15 +22,15 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("iptables ECN matching module");
 MODULE_LICENSE("GPL");
 
-static inline int match_ip(const struct sk_buff *skb,
-			   const struct ipt_ecn_info *einfo)
+static inline bool match_ip(const struct sk_buff *skb,
+			    const struct ipt_ecn_info *einfo)
 {
 	return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect;
 }
 
-static inline int match_tcp(const struct sk_buff *skb,
-			    const struct ipt_ecn_info *einfo,
-			    bool *hotdrop)
+static inline bool match_tcp(const struct sk_buff *skb,
+			     const struct ipt_ecn_info *einfo,
+			     bool *hotdrop)
 {
 	struct tcphdr _tcph, *th;
 
@@ -40,51 +40,51 @@ static inline int match_tcp(const struct sk_buff *skb,
 	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
 	if (th == NULL) {
 		*hotdrop = false;
-		return 0;
+		return false;
 	}
 
 	if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
 		if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
 			if (th->ece == 1)
-				return 0;
+				return false;
 		} else {
 			if (th->ece == 0)
-				return 0;
+				return false;
 		}
 	}
 
 	if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
 		if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
 			if (th->cwr == 1)
-				return 0;
+				return false;
 		} else {
 			if (th->cwr == 0)
-				return 0;
+				return false;
 		}
 	}
 
-	return 1;
+	return true;
 }
 
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in, const struct net_device *out,
-		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, bool *hotdrop)
+static bool match(const struct sk_buff *skb,
+		  const struct net_device *in, const struct net_device *out,
+		  const struct xt_match *match, const void *matchinfo,
+		  int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_ecn_info *info = matchinfo;
 
 	if (info->operation & IPT_ECN_OP_MATCH_IP)
 		if (!match_ip(skb, info))
-			return 0;
+			return false;
 
 	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
 		if (ip_hdr(skb)->protocol != IPPROTO_TCP)
-			return 0;
+			return false;
 		if (!match_tcp(skb, info, hotdrop))
-			return 0;
+			return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int checkentry(const char *tablename, const void *ip_void,
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
index 86f225c1d067..b266d98aac8c 100644
--- a/net/ipv4/netfilter/ipt_iprange.c
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -23,7 +23,7 @@ MODULE_DESCRIPTION("iptables arbitrary IP range match module");
 #define DEBUGP(format, args...)
 #endif
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -44,7 +44,7 @@ match(const struct sk_buff *skb,
 				info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
 				NIPQUAD(info->src.min_ip),
 				NIPQUAD(info->src.max_ip));
-			return 0;
+			return false;
 		}
 	}
 	if (info->flags & IPRANGE_DST) {
@@ -57,10 +57,10 @@ match(const struct sk_buff *skb,
 				info->flags & IPRANGE_DST_INV ? "(INV) " : "",
 				NIPQUAD(info->dst.min_ip),
 				NIPQUAD(info->dst.max_ip));
-			return 0;
+			return false;
 		}
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match iprange_match = {
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 92be562c4aca..8f441cef5504 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -21,7 +21,7 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
 MODULE_DESCRIPTION("iptables owner match");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -34,21 +34,21 @@ match(const struct sk_buff *skb,
 	const struct ipt_owner_info *info = matchinfo;
 
 	if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
-		return 0;
+		return false;
 
 	if(info->match & IPT_OWNER_UID) {
 		if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
 		    !!(info->invert & IPT_OWNER_UID))
-			return 0;
+			return false;
 	}
 
 	if(info->match & IPT_OWNER_GID) {
 		if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
 		    !!(info->invert & IPT_OWNER_GID))
-			return 0;
+			return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 81f1a017f311..2e513ed9b6e9 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -169,7 +169,7 @@ static void recent_table_flush(struct recent_table *t)
 	}
 }
 
-static int
+static bool
 ipt_recent_match(const struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 const struct xt_match *match, const void *matchinfo,
@@ -180,7 +180,7 @@ ipt_recent_match(const struct sk_buff *skb,
 	struct recent_entry *e;
 	__be32 addr;
 	u_int8_t ttl;
-	int ret = info->invert;
+	bool ret = info->invert;
 
 	if (info->side == IPT_RECENT_DEST)
 		addr = ip_hdr(skb)->daddr;
@@ -202,15 +202,15 @@ ipt_recent_match(const struct sk_buff *skb,
 		e = recent_entry_init(t, addr, ttl);
 		if (e == NULL)
 			*hotdrop = true;
-		ret ^= 1;
+		ret = !ret;
 		goto out;
 	}
 
 	if (info->check_set & IPT_RECENT_SET)
-		ret ^= 1;
+		ret = !ret;
 	else if (info->check_set & IPT_RECENT_REMOVE) {
 		recent_entry_remove(t, e);
-		ret ^= 1;
+		ret = !ret;
 	} else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
 		unsigned long t = jiffies - info->seconds * HZ;
 		unsigned int i, hits = 0;
@@ -219,7 +219,7 @@ ipt_recent_match(const struct sk_buff *skb,
 			if (info->seconds && time_after(t, e->stamps[i]))
 				continue;
 			if (++hits >= info->hit_count) {
-				ret ^= 1;
+				ret = !ret;
 				break;
 			}
 		}
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
index 803ed4c35b55..67699ae46d37 100644
--- a/net/ipv4/netfilter/ipt_tos.c
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -18,7 +18,7 @@
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("iptables TOS match module");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index e7316b27d2c5..82fe4ea8ab79 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -18,10 +18,10 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("IP tables TTL matching module");
 MODULE_LICENSE("GPL");
 
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in, const struct net_device *out,
-		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, bool *hotdrop)
+static bool match(const struct sk_buff *skb,
+		  const struct net_device *in, const struct net_device *out,
+		  const struct xt_match *match, const void *matchinfo,
+		  int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ipt_ttl_info *info = matchinfo;
 	const u8 ttl = ip_hdr(skb)->ttl;
@@ -42,10 +42,10 @@ static int match(const struct sk_buff *skb,
 		default:
 			printk(KERN_WARNING "ipt_ttl: unknown mode %d\n",
 				info->mode);
-			return 0;
+			return false;
 	}
 
-	return 0;
+	return false;
 }
 
 static struct xt_match ttl_match = {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 13c66a75c21c..31f42e82184a 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -96,7 +96,7 @@ ip6t_ext_hdr(u8 nexthdr)
 }
 
 /* Returns whether matches rule or not. */
-static inline int
+static inline bool
 ip6_packet_match(const struct sk_buff *skb,
 		 const char *indev,
 		 const char *outdev,
@@ -122,7 +122,7 @@ ip6_packet_match(const struct sk_buff *skb,
 		dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
 			ipinfo->dmsk.s_addr, ipinfo->dst.s_addr,
 			ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/
-		return 0;
+		return false;
 	}
 
 	/* Look for ifname matches; this should unroll nicely. */
@@ -136,7 +136,7 @@ ip6_packet_match(const struct sk_buff *skb,
 		dprintf("VIA in mismatch (%s vs %s).%s\n",
 			indev, ip6info->iniface,
 			ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":"");
-		return 0;
+		return false;
 	}
 
 	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
@@ -149,7 +149,7 @@ ip6_packet_match(const struct sk_buff *skb,
 		dprintf("VIA out mismatch (%s vs %s).%s\n",
 			outdev, ip6info->outiface,
 			ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":"");
-		return 0;
+		return false;
 	}
 
 /* ... might want to do something with class and flowlabel here ... */
@@ -163,7 +163,7 @@ ip6_packet_match(const struct sk_buff *skb,
 		if (protohdr < 0) {
 			if (_frag_off == 0)
 				*hotdrop = true;
-			return 0;
+			return false;
 		}
 		*fragoff = _frag_off;
 
@@ -174,17 +174,17 @@ ip6_packet_match(const struct sk_buff *skb,
 
 		if (ip6info->proto == protohdr) {
 			if(ip6info->invflags & IP6T_INV_PROTO) {
-				return 0;
+				return false;
 			}
-			return 1;
+			return true;
 		}
 
 		/* We need match for the '-p all', too! */
 		if ((ip6info->proto != 0) &&
 			!(ip6info->invflags & IP6T_INV_PROTO))
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
 }
 
 /* should be ip6 safe */
@@ -219,20 +219,20 @@ ip6t_error(struct sk_buff **pskb,
 }
 
 static inline
-int do_match(struct ip6t_entry_match *m,
-	     const struct sk_buff *skb,
-	     const struct net_device *in,
-	     const struct net_device *out,
-	     int offset,
-	     unsigned int protoff,
-	     bool *hotdrop)
+bool do_match(struct ip6t_entry_match *m,
+	      const struct sk_buff *skb,
+	      const struct net_device *in,
+	      const struct net_device *out,
+	      int offset,
+	      unsigned int protoff,
+	      bool *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
 	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
 				      offset, protoff, hotdrop))
-		return 1;
+		return true;
 	else
-		return 0;
+		return false;
 }
 
 static inline struct ip6t_entry *
@@ -1291,7 +1291,7 @@ icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
 		^ invert;
 }
 
-static int
+static bool
 icmp6_match(const struct sk_buff *skb,
 	   const struct net_device *in,
 	   const struct net_device *out,
@@ -1306,7 +1306,7 @@ icmp6_match(const struct sk_buff *skb,
 
 	/* Must not be a fragment. */
 	if (offset)
-		return 0;
+		return false;
 
 	ic = skb_header_pointer(skb, protoff, sizeof(_icmp), &_icmp);
 	if (ic == NULL) {
@@ -1314,7 +1314,7 @@ icmp6_match(const struct sk_buff *skb,
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil ICMP tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return icmp6_type_code_match(icmpinfo->type,
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 27b7bd279c0e..607c2eb1296f 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -30,10 +30,10 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 #endif
 
 /* Returns 1 if the spi is matched by the range, 0 otherwise */
-static inline int
-spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
 {
-	int r=0;
+	bool r;
 	DEBUGP("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
 	       min,spi,max);
 	r = (spi >= min && spi <= max) ^ invert;
@@ -41,7 +41,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
 	return r;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -61,13 +61,13 @@ match(const struct sk_buff *skb,
 	if (err < 0) {
 		if (err != -ENOENT)
 			*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
 	if (ah == NULL) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	hdrlen = (ah->hdrlen + 2) << 2;
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index 69e79e19040e..bebb12a1d0e6 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -19,7 +19,7 @@ MODULE_DESCRIPTION("IPv6 EUI64 address checking match");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -36,7 +36,7 @@ match(const struct sk_buff *skb,
 	      (skb_mac_header(skb) + ETH_HLEN) <= skb->data) &&
 	    offset != 0) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	memset(eui64, 0, sizeof(eui64));
@@ -55,11 +55,11 @@ match(const struct sk_buff *skb,
 				i++;
 
 			if (i == 8)
-				return 1;
+				return true;
 		}
 	}
 
-	return 0;
+	return false;
 }
 
 static struct xt_match eui64_match = {
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 740fdcafa5f3..0ed5fbcf1f18 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -29,10 +29,10 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 #endif
 
 /* Returns 1 if the id is matched by the range, 0 otherwise */
-static inline int
-id_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert)
+static inline bool
+id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
 {
-	int r = 0;
+	bool r;
 	DEBUGP("frag id_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ',
 	       min, id, max);
 	r = (id >= min && id <= max) ^ invert;
@@ -40,7 +40,7 @@ id_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert)
 	return r;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -59,13 +59,13 @@ match(const struct sk_buff *skb,
 	if (err < 0) {
 		if (err != -ENOENT)
 			*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
 	if (fh == NULL) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	DEBUGP("INFO %04X ", fh->frag_off);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 5633de160c6d..4b05393faa68 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -47,7 +47,7 @@ MODULE_ALIAS("ip6t_dst");
  *	5	-> RTALERT 2 x x
  */
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -62,7 +62,7 @@ match(const struct sk_buff *skb,
 	unsigned int temp;
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
-	unsigned int ret = 0;
+	bool ret = false;
 	u8 _opttype, *tp = NULL;
 	u8 _optlen, *lp = NULL;
 	unsigned int optlen;
@@ -72,19 +72,19 @@ match(const struct sk_buff *skb,
 	if (err < 0) {
 		if (err != -ENOENT)
 			*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
 	if (oh == NULL) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	hdrlen = ipv6_optlen(oh);
 	if (skb->len - ptr < hdrlen) {
 		/* Packet smaller than it's length field */
-		return 0;
+		return false;
 	}
 
 	DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
@@ -123,7 +123,7 @@ match(const struct sk_buff *skb,
 				DEBUGP("Tbad %02X %02X\n",
 				       *tp,
 				       (optinfo->opts[temp] & 0xFF00) >> 8);
-				return 0;
+				return false;
 			} else {
 				DEBUGP("Tok ");
 			}
@@ -144,7 +144,7 @@ match(const struct sk_buff *skb,
 				if (spec_len != 0x00FF && spec_len != *lp) {
 					DEBUGP("Lbad %02X %04X\n", *lp,
 					       spec_len);
-					return 0;
+					return false;
 				}
 				DEBUGP("Lok ");
 				optlen = *lp + 2;
@@ -167,10 +167,10 @@ match(const struct sk_buff *skb,
 		if (temp == optinfo->optsnr)
 			return ret;
 		else
-			return 0;
+			return false;
 	}
 
-	return 0;
+	return false;
 }
 
 /* Called when user tries to insert an entry of this type. */
diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c
index cbf49cffa067..b933e84a06a4 100644
--- a/net/ipv6/netfilter/ip6t_hl.c
+++ b/net/ipv6/netfilter/ip6t_hl.c
@@ -19,10 +19,10 @@ MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
 MODULE_DESCRIPTION("IP tables Hop Limit matching module");
 MODULE_LICENSE("GPL");
 
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in, const struct net_device *out,
-		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, bool *hotdrop)
+static bool match(const struct sk_buff *skb,
+		  const struct net_device *in, const struct net_device *out,
+		  const struct xt_match *match, const void *matchinfo,
+		  int offset, unsigned int protoff, bool *hotdrop)
 {
 	const struct ip6t_hl_info *info = matchinfo;
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -43,10 +43,10 @@ static int match(const struct sk_buff *skb,
 		default:
 			printk(KERN_WARNING "ip6t_hl: unknown mode %d\n",
 				info->mode);
-			return 0;
+			return false;
 	}
 
-	return 0;
+	return false;
 }
 
 static struct xt_match hl_match = {
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 469dec27c649..3222e8959426 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -26,7 +26,7 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IPv6 headers match");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 
-static int
+static bool
 ipv6header_match(const struct sk_buff *skb,
 		 const struct net_device *in,
 		 const struct net_device *out,
@@ -58,7 +58,7 @@ ipv6header_match(const struct sk_buff *skb,
 
 		/* Is there enough space for the next ext header? */
 		if (len < (int)sizeof(struct ipv6_opt_hdr))
-			return 0;
+			return false;
 		/* No more exthdr -> evaluate */
 		if (nexthdr == NEXTHDR_NONE) {
 			temp |= MASK_NONE;
@@ -99,7 +99,7 @@ ipv6header_match(const struct sk_buff *skb,
 			temp |= MASK_DSTOPTS;
 			break;
 		default:
-			return 0;
+			return false;
 			break;
 		}
 
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
index c27647b6c274..ddffe03a8b37 100644
--- a/net/ipv6/netfilter/ip6t_mh.c
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -31,16 +31,13 @@ MODULE_LICENSE("GPL");
 #endif
 
 /* Returns 1 if the type is matched by the range, 0 otherwise */
-static inline int
-type_match(u_int8_t min, u_int8_t max, u_int8_t type, int invert)
+static inline bool
+type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert)
 {
-	int ret;
-
-	ret = (type >= min && type <= max) ^ invert;
-	return ret;
+	return (type >= min && type <= max) ^ invert;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
 	 const struct net_device *in,
 	 const struct net_device *out,
@@ -55,7 +52,7 @@ match(const struct sk_buff *skb,
 
 	/* Must not be a fragment. */
 	if (offset)
-		return 0;
+		return false;
 
 	mh = skb_header_pointer(skb, protoff, sizeof(_mh), &_mh);
 	if (mh == NULL) {
@@ -63,14 +60,14 @@ match(const struct sk_buff *skb,
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil MH tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	if (mh->ip6mh_proto != IPPROTO_NONE) {
 		duprintf("Dropping invalid MH Payload Proto: %u\n",
 			 mh->ip6mh_proto);
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return type_match(mhinfo->types[0], mhinfo->types[1], mh->ip6mh_type,
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index f90f7c32cc9e..cadd0a64fed7 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -23,7 +23,7 @@ MODULE_DESCRIPTION("IP6 tables owner matching module");
 MODULE_LICENSE("GPL");
 
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -36,21 +36,21 @@ match(const struct sk_buff *skb,
 	const struct ip6t_owner_info *info = matchinfo;
 
 	if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
-		return 0;
+		return false;
 
 	if (info->match & IP6T_OWNER_UID) {
 		if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
 		    !!(info->invert & IP6T_OWNER_UID))
-			return 0;
+			return false;
 	}
 
 	if (info->match & IP6T_OWNER_GID) {
 		if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
 		    !!(info->invert & IP6T_OWNER_GID))
-			return 0;
+			return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 2bb88214cfda..7966f4a5e9b7 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -31,10 +31,10 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 #endif
 
 /* Returns 1 if the id is matched by the range, 0 otherwise */
-static inline int
-segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert)
+static inline bool
+segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
 {
-	int r = 0;
+	bool r;
 	DEBUGP("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x",
 	       invert ? '!' : ' ', min, id, max);
 	r = (id >= min && id <= max) ^ invert;
@@ -42,7 +42,7 @@ segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert)
 	return r;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -57,7 +57,7 @@ match(const struct sk_buff *skb,
 	unsigned int temp;
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
-	unsigned int ret = 0;
+	bool ret = false;
 	struct in6_addr *ap, _addr;
 	int err;
 
@@ -65,19 +65,19 @@ match(const struct sk_buff *skb,
 	if (err < 0) {
 		if (err != -ENOENT)
 			*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
 	if (rh == NULL) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	hdrlen = ipv6_optlen(rh);
 	if (skb->len - ptr < hdrlen) {
 		/* Pcket smaller than its length field */
-		return 0;
+		return false;
 	}
 
 	DEBUGP("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
@@ -136,7 +136,7 @@ match(const struct sk_buff *skb,
 		DEBUGP("Not strict ");
 		if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
 			DEBUGP("There isn't enough space\n");
-			return 0;
+			return false;
 		} else {
 			unsigned int i = 0;
 
@@ -164,13 +164,13 @@ match(const struct sk_buff *skb,
 			if (i == rtinfo->addrnr)
 				return ret;
 			else
-				return 0;
+				return false;
 		}
 	} else {
 		DEBUGP("Strict ");
 		if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
 			DEBUGP("There isn't enough space\n");
-			return 0;
+			return false;
 		} else {
 			DEBUGP("#%d ", rtinfo->addrnr);
 			for (temp = 0; temp < rtinfo->addrnr; temp++) {
@@ -190,11 +190,11 @@ match(const struct sk_buff *skb,
 			    (temp == (unsigned int)((hdrlen - 8) / 16)))
 				return ret;
 			else
-				return 0;
+				return false;
 		}
 	}
 
-	return 0;
+	return false;
 }
 
 /* Called when user tries to insert an entry of this type. */
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
index 20690ea0d466..aa9503ff90ba 100644
--- a/net/netfilter/xt_comment.c
+++ b/net/netfilter/xt_comment.c
@@ -15,7 +15,7 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_comment");
 MODULE_ALIAS("ip6t_comment");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -26,7 +26,7 @@ match(const struct sk_buff *skb,
       bool *hotdrop)
 {
 	/* We always match */
-	return 1;
+	return true;
 }
 
 static struct xt_match xt_comment_match[] = {
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 8fe5775901e1..aada7b797549 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -15,7 +15,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
 MODULE_ALIAS("ipt_connbytes");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -35,7 +35,7 @@ match(const struct sk_buff *skb,
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct)
-		return 0;
+		return false;
 	counters = ct->counters;
 
 	switch (sinfo->what) {
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 8a6d58ab5d2b..3321b80aff4f 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -30,7 +30,7 @@ MODULE_DESCRIPTION("IP tables connmark match module");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_connmark");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -46,7 +46,7 @@ match(const struct sk_buff *skb,
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct)
-		return 0;
+		return false;
 
 	return (((ct->mark) & info->mask) == info->mark) ^ info->invert;
 }
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 915c730d3b72..26901f95bf4b 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -19,7 +19,7 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
 MODULE_DESCRIPTION("iptables connection tracking match module");
 MODULE_ALIAS("ipt_conntrack");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -54,53 +54,53 @@ match(const struct sk_buff *skb,
 		}
 		if (FWINV((statebit & sinfo->statemask) == 0,
 			  XT_CONNTRACK_STATE))
-			return 0;
+			return false;
 	}
 
 	if (ct == NULL) {
 		if (sinfo->flags & ~XT_CONNTRACK_STATE)
-			return 0;
-		return 1;
+			return false;
+		return true;
 	}
 
 	if (sinfo->flags & XT_CONNTRACK_PROTO &&
 	    FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum !=
 		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum,
 		  XT_CONNTRACK_PROTO))
-		return 0;
+		return false;
 
 	if (sinfo->flags & XT_CONNTRACK_ORIGSRC &&
 	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip &
 		   sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
 		  sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip,
 		  XT_CONNTRACK_ORIGSRC))
-		return 0;
+		return false;
 
 	if (sinfo->flags & XT_CONNTRACK_ORIGDST &&
 	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip &
 		   sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
 		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip,
 		  XT_CONNTRACK_ORIGDST))
-		return 0;
+		return false;
 
 	if (sinfo->flags & XT_CONNTRACK_REPLSRC &&
 	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip &
 		   sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) !=
 		  sinfo->tuple[IP_CT_DIR_REPLY].src.ip,
 		  XT_CONNTRACK_REPLSRC))
-		return 0;
+		return false;
 
 	if (sinfo->flags & XT_CONNTRACK_REPLDST &&
 	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip &
 		   sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) !=
 		  sinfo->tuple[IP_CT_DIR_REPLY].dst.ip,
 		  XT_CONNTRACK_REPLDST))
-		return 0;
+		return false;
 
 	if (sinfo->flags & XT_CONNTRACK_STATUS &&
 	    FWINV((ct->status & sinfo->statusmask) == 0,
 		  XT_CONNTRACK_STATUS))
-		return 0;
+		return false;
 
 	if(sinfo->flags & XT_CONNTRACK_EXPIRES) {
 		unsigned long expires = timer_pending(&ct->timeout) ?
@@ -109,9 +109,9 @@ match(const struct sk_buff *skb,
 		if (FWINV(!(expires >= sinfo->expires_min &&
 			    expires <= sinfo->expires_max),
 			  XT_CONNTRACK_EXPIRES))
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
 }
 
 static int
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index 3172e7308b35..b0eba4e2c53f 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -31,7 +31,7 @@ MODULE_ALIAS("ipt_dccp");
 static unsigned char *dccp_optbuf;
 static DEFINE_SPINLOCK(dccp_buflock);
 
-static inline int
+static inline bool
 dccp_find_option(u_int8_t option,
 		 const struct sk_buff *skb,
 		 unsigned int protoff,
@@ -46,11 +46,11 @@ dccp_find_option(u_int8_t option,
 
 	if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	if (!optlen)
-		return 0;
+		return false;
 
 	spin_lock_bh(&dccp_buflock);
 	op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf);
@@ -58,13 +58,13 @@ dccp_find_option(u_int8_t option,
 		/* If we don't have the whole header, drop packet. */
 		spin_unlock_bh(&dccp_buflock);
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	for (i = 0; i < optlen; ) {
 		if (op[i] == option) {
 			spin_unlock_bh(&dccp_buflock);
-			return 1;
+			return true;
 		}
 
 		if (op[i] < 2)
@@ -74,24 +74,24 @@ dccp_find_option(u_int8_t option,
 	}
 
 	spin_unlock_bh(&dccp_buflock);
-	return 0;
+	return false;
 }
 
 
-static inline int
+static inline bool
 match_types(const struct dccp_hdr *dh, u_int16_t typemask)
 {
 	return (typemask & (1 << dh->dccph_type));
 }
 
-static inline int
+static inline bool
 match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff,
 	     const struct dccp_hdr *dh, bool *hotdrop)
 {
 	return dccp_find_option(option, skb, protoff, dh, hotdrop);
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -105,12 +105,12 @@ match(const struct sk_buff *skb,
 	struct dccp_hdr _dh, *dh;
 
 	if (offset)
-		return 0;
+		return false;
 
 	dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh);
 	if (dh == NULL) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return  DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0])
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index c106d738da6d..c9c6518907a2 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -22,22 +22,7 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_dscp");
 MODULE_ALIAS("ip6t_dscp");
 
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in,
-		 const struct net_device *out,
-		 const struct xt_match *match,
-		 const void *matchinfo,
-		 int offset,
-		 unsigned int protoff,
-		 bool *hotdrop)
-{
-	const struct xt_dscp_info *info = matchinfo;
-	u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
-
-	return (dscp == info->dscp) ^ !!info->invert;
-}
-
-static int match6(const struct sk_buff *skb,
+static bool match(const struct sk_buff *skb,
 		  const struct net_device *in,
 		  const struct net_device *out,
 		  const struct xt_match *match,
@@ -45,6 +30,21 @@ static int match6(const struct sk_buff *skb,
 		  int offset,
 		  unsigned int protoff,
 		  bool *hotdrop)
+{
+	const struct xt_dscp_info *info = matchinfo;
+	u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
+
+	return (dscp == info->dscp) ^ !!info->invert;
+}
+
+static bool match6(const struct sk_buff *skb,
+		   const struct net_device *in,
+		   const struct net_device *out,
+		   const struct xt_match *match,
+		   const void *matchinfo,
+		   int offset,
+		   unsigned int protoff,
+		   bool *hotdrop)
 {
 	const struct xt_dscp_info *info = matchinfo;
 	u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index 5d3421bcd850..1a945cb7c359 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -31,10 +31,10 @@ MODULE_ALIAS("ip6t_esp");
 #endif
 
 /* Returns 1 if the spi is matched by the range, 0 otherwise */
-static inline int
-spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
 {
-	int r = 0;
+	bool r;
 	duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ',
 		 min, spi, max);
 	r = (spi >= min && spi <= max) ^ invert;
@@ -42,7 +42,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
 	return r;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -57,7 +57,7 @@ match(const struct sk_buff *skb,
 
 	/* Must not be a fragment. */
 	if (offset)
-		return 0;
+		return false;
 
 	eh = skb_header_pointer(skb, protoff, sizeof(_esp), &_esp);
 	if (eh == NULL) {
@@ -66,7 +66,7 @@ match(const struct sk_buff *skb,
 		 */
 		duprintf("Dropping evil ESP tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return spi_match(espinfo->spis[0], espinfo->spis[1], ntohl(eh->spi),
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index cd5cba6978c3..21597b755cea 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -94,7 +94,8 @@ static DEFINE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
 static HLIST_HEAD(hashlimit_htables);
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
-static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
+static inline bool dst_cmp(const struct dsthash_ent *ent,
+			   struct dsthash_dst *b)
 {
 	return !memcmp(&ent->dst, b, sizeof(ent->dst));
 }
@@ -227,18 +228,18 @@ static int htable_create(struct xt_hashlimit_info *minfo, int family)
 	return 0;
 }
 
-static int select_all(struct xt_hashlimit_htable *ht, struct dsthash_ent *he)
+static bool select_all(struct xt_hashlimit_htable *ht, struct dsthash_ent *he)
 {
 	return 1;
 }
 
-static int select_gc(struct xt_hashlimit_htable *ht, struct dsthash_ent *he)
+static bool select_gc(struct xt_hashlimit_htable *ht, struct dsthash_ent *he)
 {
 	return (jiffies >= he->expires);
 }
 
 static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
-				int (*select)(struct xt_hashlimit_htable *ht,
+				bool (*select)(struct xt_hashlimit_htable *ht,
 					      struct dsthash_ent *he))
 {
 	unsigned int i;
@@ -432,7 +433,7 @@ hashlimit_init_dst(struct xt_hashlimit_htable *hinfo, struct dsthash_dst *dst,
 	return 0;
 }
 
-static int
+static bool
 hashlimit_match(const struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
@@ -478,17 +479,17 @@ hashlimit_match(const struct sk_buff *skb,
 		/* We're underlimit. */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
 		spin_unlock_bh(&hinfo->lock);
-		return 1;
+		return true;
 	}
 
 	spin_unlock_bh(&hinfo->lock);
 
 	/* default case: we're overlimit, thus don't match */
-	return 0;
+	return false;
 
 hotdrop:
 	*hotdrop = true;
-	return 0;
+	return false;
 }
 
 static int
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 0aa090776e27..10c629b34abf 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -28,7 +28,7 @@ MODULE_ALIAS("ip6t_helper");
 #define DEBUGP(format, args...)
 #endif
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -42,7 +42,7 @@ match(const struct sk_buff *skb,
 	struct nf_conn *ct;
 	struct nf_conn_help *master_help;
 	enum ip_conntrack_info ctinfo;
-	int ret = info->invert;
+	bool ret = info->invert;
 
 	ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
 	if (!ct) {
@@ -67,7 +67,7 @@ match(const struct sk_buff *skb,
 		ct->master->helper->name, info->name);
 
 	if (info->name[0] == '\0')
-		ret ^= 1;
+		ret = !ret;
 	else
 		ret ^= !strncmp(master_help->helper->name, info->name,
 				strlen(master_help->helper->name));
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index 621c9ee6d1c9..57bcfacde594 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -20,7 +20,7 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_length");
 MODULE_ALIAS("ip6t_length");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -36,7 +36,7 @@ match(const struct sk_buff *skb,
 	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
 }
 
-static int
+static bool
 match6(const struct sk_buff *skb,
        const struct net_device *in,
        const struct net_device *out,
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 1133b4ca4904..0cfe241a0493 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -57,7 +57,7 @@ static DEFINE_SPINLOCK(limit_lock);
 
 #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
 
-static int
+static bool
 ipt_limit_match(const struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
@@ -79,11 +79,11 @@ ipt_limit_match(const struct sk_buff *skb,
 		/* We're not limited. */
 		r->credit -= r->cost;
 		spin_unlock_bh(&limit_lock);
-		return 1;
+		return true;
 	}
 
 	spin_unlock_bh(&limit_lock);
-	return 0;
+	return false;
 }
 
 /* Precision saver. */
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 0e6a28647206..86022027dd63 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -24,7 +24,7 @@ MODULE_DESCRIPTION("iptables mac matching module");
 MODULE_ALIAS("ipt_mac");
 MODULE_ALIAS("ip6t_mac");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 944d1ea56029..10c6799cd56a 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -19,7 +19,7 @@ MODULE_DESCRIPTION("iptables mark matching module");
 MODULE_ALIAS("ipt_mark");
 MODULE_ALIAS("ip6t_mark");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index 1dc53ded9887..55feb3d737d4 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -33,24 +33,24 @@ MODULE_ALIAS("ip6t_multiport");
 #endif
 
 /* Returns 1 if the port is matched by the test, 0 otherwise. */
-static inline int
+static inline bool
 ports_match(const u_int16_t *portlist, enum xt_multiport_flags flags,
 	    u_int8_t count, u_int16_t src, u_int16_t dst)
 {
 	unsigned int i;
 	for (i = 0; i < count; i++) {
 		if (flags != XT_MULTIPORT_DESTINATION && portlist[i] == src)
-			return 1;
+			return true;
 
 		if (flags != XT_MULTIPORT_SOURCE && portlist[i] == dst)
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 /* Returns 1 if the port is matched by the test, 0 otherwise. */
-static inline int
+static inline bool
 ports_match_v1(const struct xt_multiport_v1 *minfo,
 	       u_int16_t src, u_int16_t dst)
 {
@@ -67,34 +67,34 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
 
 			if (minfo->flags == XT_MULTIPORT_SOURCE
 			    && src >= s && src <= e)
-				return 1 ^ minfo->invert;
+				return true ^ minfo->invert;
 			if (minfo->flags == XT_MULTIPORT_DESTINATION
 			    && dst >= s && dst <= e)
-				return 1 ^ minfo->invert;
+				return true ^ minfo->invert;
 			if (minfo->flags == XT_MULTIPORT_EITHER
 			    && ((dst >= s && dst <= e)
 				|| (src >= s && src <= e)))
-				return 1 ^ minfo->invert;
+				return true ^ minfo->invert;
 		} else {
 			/* exact port matching */
 			duprintf("src or dst matches with %d?\n", s);
 
 			if (minfo->flags == XT_MULTIPORT_SOURCE
 			    && src == s)
-				return 1 ^ minfo->invert;
+				return true ^ minfo->invert;
 			if (minfo->flags == XT_MULTIPORT_DESTINATION
 			    && dst == s)
-				return 1 ^ minfo->invert;
+				return true ^ minfo->invert;
 			if (minfo->flags == XT_MULTIPORT_EITHER
 			    && (src == s || dst == s))
-				return 1 ^ minfo->invert;
+				return true ^ minfo->invert;
 		}
 	}
 
 	return minfo->invert;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -108,7 +108,7 @@ match(const struct sk_buff *skb,
 	const struct xt_multiport *multiinfo = matchinfo;
 
 	if (offset)
-		return 0;
+		return false;
 
 	pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports);
 	if (pptr == NULL) {
@@ -117,7 +117,7 @@ match(const struct sk_buff *skb,
 		 */
 		duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return ports_match(multiinfo->ports,
@@ -125,7 +125,7 @@ match(const struct sk_buff *skb,
 			   ntohs(pptr[0]), ntohs(pptr[1]));
 }
 
-static int
+static bool
 match_v1(const struct sk_buff *skb,
 	 const struct net_device *in,
 	 const struct net_device *out,
@@ -139,7 +139,7 @@ match_v1(const struct sk_buff *skb,
 	const struct xt_multiport_v1 *multiinfo = matchinfo;
 
 	if (offset)
-		return 0;
+		return false;
 
 	pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports);
 	if (pptr == NULL) {
@@ -148,7 +148,7 @@ match_v1(const struct sk_buff *skb,
 		 */
 		duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index a6de512fa840..70de6708e884 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -14,8 +14,6 @@
 #include <linux/netfilter/xt_physdev.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge.h>
-#define MATCH   1
-#define NOMATCH 0
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
@@ -23,7 +21,7 @@ MODULE_DESCRIPTION("iptables bridge physical device match module");
 MODULE_ALIAS("ipt_physdev");
 MODULE_ALIAS("ip6t_physdev");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -36,7 +34,7 @@ match(const struct sk_buff *skb,
 	int i;
 	static const char nulldevname[IFNAMSIZ];
 	const struct xt_physdev_info *info = matchinfo;
-	unsigned int ret;
+	bool ret;
 	const char *indev, *outdev;
 	struct nf_bridge_info *nf_bridge;
 
@@ -47,58 +45,58 @@ match(const struct sk_buff *skb,
 		/* Return MATCH if the invert flags of the used options are on */
 		if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
 		    !(info->invert & XT_PHYSDEV_OP_BRIDGED))
-			return NOMATCH;
+			return false;
 		if ((info->bitmask & XT_PHYSDEV_OP_ISIN) &&
 		    !(info->invert & XT_PHYSDEV_OP_ISIN))
-			return NOMATCH;
+			return false;
 		if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) &&
 		    !(info->invert & XT_PHYSDEV_OP_ISOUT))
-			return NOMATCH;
+			return false;
 		if ((info->bitmask & XT_PHYSDEV_OP_IN) &&
 		    !(info->invert & XT_PHYSDEV_OP_IN))
-			return NOMATCH;
+			return false;
 		if ((info->bitmask & XT_PHYSDEV_OP_OUT) &&
 		    !(info->invert & XT_PHYSDEV_OP_OUT))
-			return NOMATCH;
-		return MATCH;
+			return false;
+		return true;
 	}
 
 	/* This only makes sense in the FORWARD and POSTROUTING chains */
 	if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
 	    (!!(nf_bridge->mask & BRNF_BRIDGED) ^
 	    !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
-		return NOMATCH;
+		return false;
 
 	if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
 	    (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) ||
 	    (info->bitmask & XT_PHYSDEV_OP_ISOUT &&
 	    (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT))))
-		return NOMATCH;
+		return false;
 
 	if (!(info->bitmask & XT_PHYSDEV_OP_IN))
 		goto match_outdev;
 	indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
-	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+	for (i = 0, ret = false; i < IFNAMSIZ/sizeof(unsigned int); i++) {
 		ret |= (((const unsigned int *)indev)[i]
 			^ ((const unsigned int *)info->physindev)[i])
 			& ((const unsigned int *)info->in_mask)[i];
 	}
 
-	if ((ret == 0) ^ !(info->invert & XT_PHYSDEV_OP_IN))
-		return NOMATCH;
+	if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN))
+		return false;
 
 match_outdev:
 	if (!(info->bitmask & XT_PHYSDEV_OP_OUT))
-		return MATCH;
+		return true;
 	outdev = nf_bridge->physoutdev ?
 		 nf_bridge->physoutdev->name : nulldevname;
-	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+	for (i = 0, ret = false; i < IFNAMSIZ/sizeof(unsigned int); i++) {
 		ret |= (((const unsigned int *)outdev)[i]
 			^ ((const unsigned int *)info->physoutdev)[i])
 			& ((const unsigned int *)info->out_mask)[i];
 	}
 
-	return (ret != 0) ^ !(info->invert & XT_PHYSDEV_OP_OUT);
+	return ret ^ !(info->invert & XT_PHYSDEV_OP_OUT);
 }
 
 static int
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 692581f40c5f..63239727bc22 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -21,7 +21,7 @@ MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
 MODULE_ALIAS("ipt_pkttype");
 MODULE_ALIAS("ip6t_pkttype");
 
-static int match(const struct sk_buff *skb,
+static bool match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
       const struct xt_match *match,
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 6878482cd527..0aa487b1f3b8 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -20,7 +20,7 @@ MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_DESCRIPTION("Xtables IPsec policy matching module");
 MODULE_LICENSE("GPL");
 
-static inline int
+static inline bool
 xt_addr_cmp(const union xt_policy_addr *a1, const union xt_policy_addr *m,
 	    const union xt_policy_addr *a2, unsigned short family)
 {
@@ -30,10 +30,10 @@ xt_addr_cmp(const union xt_policy_addr *a1, const union xt_policy_addr *m,
 	case AF_INET6:
 		return !ipv6_masked_addr_cmp(&a1->a6, &m->a6, &a2->a6);
 	}
-	return 0;
+	return false;
 }
 
-static inline int
+static inline bool
 match_xfrm_state(struct xfrm_state *x, const struct xt_policy_elem *e,
 		 unsigned short family)
 {
@@ -108,14 +108,14 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
 	return strict ? i == info->len : 0;
 }
 
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in,
-		 const struct net_device *out,
-		 const struct xt_match *match,
-		 const void *matchinfo,
-		 int offset,
-		 unsigned int protoff,
-		 bool *hotdrop)
+static bool match(const struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  const struct xt_match *match,
+		  const void *matchinfo,
+		  int offset,
+		  unsigned int protoff,
+		  bool *hotdrop)
 {
 	const struct xt_policy_info *info = matchinfo;
 	int ret;
@@ -126,9 +126,9 @@ static int match(const struct sk_buff *skb,
 		ret = match_policy_out(skb, info, match->family);
 
 	if (ret < 0)
-		ret = info->flags & XT_POLICY_MATCH_NONE ? 1 : 0;
+		ret = info->flags & XT_POLICY_MATCH_NONE ? true : false;
 	else if (info->flags & XT_POLICY_MATCH_NONE)
-		ret = 0;
+		ret = false;
 
 	return ret;
 }
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 53c71ac980fc..6091347e38b3 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -16,19 +16,19 @@ MODULE_ALIAS("ip6t_quota");
 
 static DEFINE_SPINLOCK(quota_lock);
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in, const struct net_device *out,
       const struct xt_match *match, const void *matchinfo,
       int offset, unsigned int protoff, bool *hotdrop)
 {
 	struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master;
-	int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0;
+	bool ret = q->flags & XT_QUOTA_INVERT;
 
 	spin_lock_bh(&quota_lock);
 	if (q->quota >= skb->len) {
 		q->quota -= skb->len;
-		ret ^= 1;
+		ret = !ret;
 	} else {
 		/* we do not allow even small packets from now on */
 		q->quota = 0;
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index 41451f57919c..ad82c132694c 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -21,7 +21,7 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("X_tables realm match");
 MODULE_ALIAS("ipt_realm");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index e581afe89098..a118a4c71563 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -23,7 +23,7 @@ MODULE_ALIAS("ipt_sctp");
 #define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
 					      || (!!((invflag) & (option)) ^ (cond)))
 
-static int
+static bool
 match_flags(const struct xt_sctp_flag_info *flag_info,
 	    const int flag_count,
 	    u_int8_t chunktype,
@@ -37,10 +37,10 @@ match_flags(const struct xt_sctp_flag_info *flag_info,
 		}
 	}
 
-	return 1;
+	return true;
 }
 
-static inline int
+static inline bool
 match_packet(const struct sk_buff *skb,
 	     unsigned int offset,
 	     const u_int32_t *chunkmap,
@@ -65,7 +65,7 @@ match_packet(const struct sk_buff *skb,
 		if (sch == NULL || sch->length == 0) {
 			duprintf("Dropping invalid SCTP packet.\n");
 			*hotdrop = true;
-			return 0;
+			return false;
 		}
 
 		duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n",
@@ -80,7 +80,7 @@ match_packet(const struct sk_buff *skb,
 			case SCTP_CHUNK_MATCH_ANY:
 				if (match_flags(flag_info, flag_count,
 					sch->type, sch->flags)) {
-					return 1;
+					return true;
 				}
 				break;
 
@@ -94,14 +94,14 @@ match_packet(const struct sk_buff *skb,
 			case SCTP_CHUNK_MATCH_ONLY:
 				if (!match_flags(flag_info, flag_count,
 					sch->type, sch->flags)) {
-					return 0;
+					return false;
 				}
 				break;
 			}
 		} else {
 			switch (chunk_match_type) {
 			case SCTP_CHUNK_MATCH_ONLY:
-				return 0;
+				return false;
 			}
 		}
 	} while (offset < skb->len);
@@ -110,16 +110,16 @@ match_packet(const struct sk_buff *skb,
 	case SCTP_CHUNK_MATCH_ALL:
 		return SCTP_CHUNKMAP_IS_CLEAR(chunkmap);
 	case SCTP_CHUNK_MATCH_ANY:
-		return 0;
+		return false;
 	case SCTP_CHUNK_MATCH_ONLY:
-		return 1;
+		return true;
 	}
 
 	/* This will never be reached, but required to stop compiler whine */
-	return 0;
+	return false;
 }
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -134,14 +134,14 @@ match(const struct sk_buff *skb,
 
 	if (offset) {
 		duprintf("Dropping non-first fragment.. FIXME\n");
-		return 0;
+		return false;
 	}
 
 	sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh);
 	if (sh == NULL) {
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 	duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
 
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index 74fe069fc3aa..f77f74ad5c97 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -20,7 +20,7 @@ MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module");
 MODULE_ALIAS("ipt_state");
 MODULE_ALIAS("ip6t_state");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 4e5ed81e9ce1..989924f9024e 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -24,26 +24,26 @@ MODULE_ALIAS("ip6t_statistic");
 
 static DEFINE_SPINLOCK(nth_lock);
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in, const struct net_device *out,
       const struct xt_match *match, const void *matchinfo,
       int offset, unsigned int protoff, bool *hotdrop)
 {
 	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
-	int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0;
+	bool ret = info->flags & XT_STATISTIC_INVERT;
 
 	switch (info->mode) {
 	case XT_STATISTIC_MODE_RANDOM:
 		if ((net_random() & 0x7FFFFFFF) < info->u.random.probability)
-			ret ^= 1;
+			ret = !ret;
 		break;
 	case XT_STATISTIC_MODE_NTH:
 		info = info->master;
 		spin_lock_bh(&nth_lock);
 		if (info->u.nth.count++ == info->u.nth.every) {
 			info->u.nth.count = 0;
-			ret ^= 1;
+			ret = !ret;
 		}
 		spin_unlock_bh(&nth_lock);
 		break;
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 7552d8927570..3aea43d37339 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -21,14 +21,14 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_string");
 MODULE_ALIAS("ip6t_string");
 
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in,
-		 const struct net_device *out,
-		 const struct xt_match *match,
-		 const void *matchinfo,
-		 int offset,
-		 unsigned int protoff,
-		 bool *hotdrop)
+static bool match(const struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  const struct xt_match *match,
+		  const void *matchinfo,
+		  int offset,
+		  unsigned int protoff,
+		  bool *hotdrop)
 {
 	const struct xt_string_info *conf = matchinfo;
 	struct ts_state state;
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 0db4f5362180..e9bfd3dd3c81 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -23,7 +23,7 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
 MODULE_DESCRIPTION("iptables TCP MSS match module");
 MODULE_ALIAS("ipt_tcpmss");
 
-static int
+static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -78,7 +78,7 @@ out:
 
 dropit:
 	*hotdrop = true;
-	return 0;
+	return false;
 }
 
 static struct xt_match xt_tcpmss_match[] = {
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index ca9ccdd931bc..9ecc4a5bd529 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -27,21 +27,18 @@ MODULE_ALIAS("ip6t_tcp");
 
 
 /* Returns 1 if the port is matched by the range, 0 otherwise */
-static inline int
-port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
+static inline bool
+port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert)
 {
-	int ret;
-
-	ret = (port >= min && port <= max) ^ invert;
-	return ret;
+	return (port >= min && port <= max) ^ invert;
 }
 
-static int
+static bool
 tcp_find_option(u_int8_t option,
 		const struct sk_buff *skb,
 		unsigned int protoff,
 		unsigned int optlen,
-		int invert,
+		bool invert,
 		bool *hotdrop)
 {
 	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
@@ -58,7 +55,7 @@ tcp_find_option(u_int8_t option,
 				optlen, _opt);
 	if (op == NULL) {
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	for (i = 0; i < optlen; ) {
@@ -70,7 +67,7 @@ tcp_find_option(u_int8_t option,
 	return invert;
 }
 
-static int
+static bool
 tcp_match(const struct sk_buff *skb,
 	  const struct net_device *in,
 	  const struct net_device *out,
@@ -95,7 +92,7 @@ tcp_match(const struct sk_buff *skb,
 			*hotdrop = true;
 		}
 		/* Must not be a fragment. */
-		return 0;
+		return false;
 	}
 
 #define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
@@ -106,33 +103,33 @@ tcp_match(const struct sk_buff *skb,
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
 			ntohs(th->source),
 			!!(tcpinfo->invflags & XT_TCP_INV_SRCPT)))
-		return 0;
+		return false;
 	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
 			ntohs(th->dest),
 			!!(tcpinfo->invflags & XT_TCP_INV_DSTPT)))
-		return 0;
+		return false;
 	if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
 		      == tcpinfo->flg_cmp,
 		      XT_TCP_INV_FLAGS))
-		return 0;
+		return false;
 	if (tcpinfo->option) {
 		if (th->doff * 4 < sizeof(_tcph)) {
 			*hotdrop = true;
-			return 0;
+			return false;
 		}
 		if (!tcp_find_option(tcpinfo->option, skb, protoff,
 				     th->doff*4 - sizeof(_tcph),
 				     tcpinfo->invflags & XT_TCP_INV_OPTION,
 				     hotdrop))
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
 }
 
 /* Called when user tries to insert an entry of this type. */
@@ -149,7 +146,7 @@ tcp_checkentry(const char *tablename,
 	return !(tcpinfo->invflags & ~XT_TCP_INV_MASK);
 }
 
-static int
+static bool
 udp_match(const struct sk_buff *skb,
 	  const struct net_device *in,
 	  const struct net_device *out,
@@ -164,7 +161,7 @@ udp_match(const struct sk_buff *skb,
 
 	/* Must not be a fragment. */
 	if (offset)
-		return 0;
+		return false;
 
 	uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph);
 	if (uh == NULL) {
@@ -172,7 +169,7 @@ udp_match(const struct sk_buff *skb,
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil UDP tinygram.\n");
 		*hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return port_match(udpinfo->spts[0], udpinfo->spts[1],
-- 
cgit v1.3-8-gc7d7


From ccb79bdce71f2c04cfa9bfcbaf4d37e2f963d684 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sat, 7 Jul 2007 22:16:00 -0700
Subject: [NETFILTER]: x_tables: switch xt_match->checkentry to bool

Switch the return type of match functions to boolean

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h   | 10 +++++-----
 net/ipv4/netfilter/ip_tables.c       | 10 +++++-----
 net/ipv4/netfilter/ipt_ah.c          |  6 +++---
 net/ipv4/netfilter/ipt_ecn.c         | 14 +++++++-------
 net/ipv4/netfilter/ipt_owner.c       |  6 +++---
 net/ipv4/netfilter/ipt_recent.c      | 14 +++++++-------
 net/ipv6/netfilter/ip6_tables.c      | 14 +++++++-------
 net/ipv6/netfilter/ip6t_ah.c         |  6 +++---
 net/ipv6/netfilter/ip6t_frag.c       |  6 +++---
 net/ipv6/netfilter/ip6t_hbh.c        |  6 +++---
 net/ipv6/netfilter/ip6t_ipv6header.c |  6 +++---
 net/ipv6/netfilter/ip6t_mh.c         |  2 +-
 net/ipv6/netfilter/ip6t_owner.c      |  6 +++---
 net/ipv6/netfilter/ip6t_rt.c         |  8 ++++----
 net/netfilter/xt_connbytes.c         | 18 +++++++++---------
 net/netfilter/xt_connmark.c          |  8 ++++----
 net/netfilter/xt_conntrack.c         |  6 +++---
 net/netfilter/xt_dccp.c              |  2 +-
 net/netfilter/xt_dscp.c              | 14 +++++++-------
 net/netfilter/xt_esp.c               |  6 +++---
 net/netfilter/xt_hashlimit.c         | 16 ++++++++--------
 net/netfilter/xt_helper.c            | 14 +++++++-------
 net/netfilter/xt_limit.c             |  6 +++---
 net/netfilter/xt_mark.c              |  6 +++---
 net/netfilter/xt_multiport.c         | 10 +++++-----
 net/netfilter/xt_physdev.c           |  8 ++++----
 net/netfilter/xt_policy.c            | 16 ++++++++--------
 net/netfilter/xt_quota.c             |  6 +++---
 net/netfilter/xt_sctp.c              |  2 +-
 net/netfilter/xt_state.c             | 14 +++++++-------
 net/netfilter/xt_statistic.c         |  6 +++---
 net/netfilter/xt_string.c            | 20 ++++++++++----------
 net/netfilter/xt_tcpudp.c            |  4 ++--
 33 files changed, 148 insertions(+), 148 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 304fce356a43..5130dd60a2fc 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -152,11 +152,11 @@ struct xt_match
 
 	/* Called when user tries to insert an entry of this type. */
 	/* Should return true or false. */
-	int (*checkentry)(const char *tablename,
-			  const void *ip,
-			  const struct xt_match *match,
-			  void *matchinfo,
-			  unsigned int hook_mask);
+	bool (*checkentry)(const char *tablename,
+			   const void *ip,
+			   const struct xt_match *match,
+			   void *matchinfo,
+			   unsigned int hook_mask);
 
 	/* Called when entry of this type deleted. */
 	void (*destroy)(const struct xt_match *match, void *matchinfo);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b9c792dd4890..7962306df585 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -152,20 +152,20 @@ ip_packet_match(const struct iphdr *ip,
 	return 1;
 }
 
-static inline int
+static inline bool
 ip_checkentry(const struct ipt_ip *ip)
 {
 	if (ip->flags & ~IPT_F_MASK) {
 		duprintf("Unknown flag bits set: %08X\n",
 			 ip->flags & ~IPT_F_MASK);
-		return 0;
+		return false;
 	}
 	if (ip->invflags & ~IPT_INV_MASK) {
 		duprintf("Unknown invflag bits set: %08X\n",
 			 ip->invflags & ~IPT_INV_MASK);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static unsigned int
@@ -2149,7 +2149,7 @@ icmp_match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 icmp_checkentry(const char *tablename,
 	   const void *info,
 	   const struct xt_match *match,
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 3da39ee92d8b..6b5b7c9f7392 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -70,7 +70,7 @@ match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *ip_void,
 	   const struct xt_match *match,
@@ -82,9 +82,9 @@ checkentry(const char *tablename,
 	/* Must specify no unknown invflags */
 	if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
 		duprintf("ipt_ah: unknown flags %X\n", ahinfo->invflags);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match ah_match = {
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index ba3a17e0f848..ba4f5497add3 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -87,27 +87,27 @@ static bool match(const struct sk_buff *skb,
 	return true;
 }
 
-static int checkentry(const char *tablename, const void *ip_void,
-		      const struct xt_match *match,
-		      void *matchinfo, unsigned int hook_mask)
+static bool checkentry(const char *tablename, const void *ip_void,
+		       const struct xt_match *match,
+		       void *matchinfo, unsigned int hook_mask)
 {
 	const struct ipt_ecn_info *info = matchinfo;
 	const struct ipt_ip *ip = ip_void;
 
 	if (info->operation & IPT_ECN_OP_MATCH_MASK)
-		return 0;
+		return false;
 
 	if (info->invert & IPT_ECN_OP_MATCH_MASK)
-		return 0;
+		return false;
 
 	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
 	    && ip->proto != IPPROTO_TCP) {
 		printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
 		       " non-tcp packets\n");
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static struct xt_match ecn_match = {
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 8f441cef5504..deea4b8cc055 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -51,7 +51,7 @@ match(const struct sk_buff *skb,
 	return true;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
@@ -63,9 +63,9 @@ checkentry(const char *tablename,
 	if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
 		printk("ipt_owner: pid, sid and command matching "
 		       "not supported anymore\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match owner_match = {
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 2e513ed9b6e9..d632e0e6ef16 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -235,7 +235,7 @@ out:
 	return ret;
 }
 
-static int
+static bool
 ipt_recent_checkentry(const char *tablename, const void *ip,
 		      const struct xt_match *match, void *matchinfo,
 		      unsigned int hook_mask)
@@ -243,24 +243,24 @@ ipt_recent_checkentry(const char *tablename, const void *ip,
 	const struct ipt_recent_info *info = matchinfo;
 	struct recent_table *t;
 	unsigned i;
-	int ret = 0;
+	bool ret = false;
 
 	if (hweight8(info->check_set &
 		     (IPT_RECENT_SET | IPT_RECENT_REMOVE |
 		      IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
-		return 0;
+		return false;
 	if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
 	    (info->seconds || info->hit_count))
-		return 0;
+		return false;
 	if (info->name[0] == '\0' ||
 	    strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
-		return 0;
+		return false;
 
 	mutex_lock(&recent_mutex);
 	t = recent_table_lookup(info->name);
 	if (t != NULL) {
 		t->refcnt++;
-		ret = 1;
+		ret = true;
 		goto out;
 	}
 
@@ -287,7 +287,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip,
 	spin_lock_bh(&recent_lock);
 	list_add_tail(&t->list, &tables);
 	spin_unlock_bh(&recent_lock);
-	ret = 1;
+	ret = true;
 out:
 	mutex_unlock(&recent_mutex);
 	return ret;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 31f42e82184a..7fe4d29708cb 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -188,20 +188,20 @@ ip6_packet_match(const struct sk_buff *skb,
 }
 
 /* should be ip6 safe */
-static inline int
+static inline bool
 ip6_checkentry(const struct ip6t_ip6 *ipv6)
 {
 	if (ipv6->flags & ~IP6T_F_MASK) {
 		duprintf("Unknown flag bits set: %08X\n",
 			 ipv6->flags & ~IP6T_F_MASK);
-		return 0;
+		return false;
 	}
 	if (ipv6->invflags & ~IP6T_INV_MASK) {
 		duprintf("Unknown invflag bits set: %08X\n",
 			 ipv6->invflags & ~IP6T_INV_MASK);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static unsigned int
@@ -1282,10 +1282,10 @@ void ip6t_unregister_table(struct xt_table *table)
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
-static inline int
+static inline bool
 icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
 		     u_int8_t type, u_int8_t code,
-		     int invert)
+		     bool invert)
 {
 	return (type == test_type && code >= min_code && code <= max_code)
 		^ invert;
@@ -1325,7 +1325,7 @@ icmp6_match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 icmp6_checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_match *match,
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 607c2eb1296f..8fc00bdfc38b 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -103,7 +103,7 @@ match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 checkentry(const char *tablename,
 	  const void *entry,
 	  const struct xt_match *match,
@@ -114,9 +114,9 @@ checkentry(const char *tablename,
 
 	if (ahinfo->invflags & ~IP6T_AH_INV_MASK) {
 		DEBUGP("ip6t_ah: unknown flags %X\n", ahinfo->invflags);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match ah_match = {
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 0ed5fbcf1f18..f0aed898e8b7 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -120,7 +120,7 @@ match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
@@ -131,9 +131,9 @@ checkentry(const char *tablename,
 
 	if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) {
 		DEBUGP("ip6t_frag: unknown flags %X\n", fraginfo->invflags);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match frag_match = {
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 4b05393faa68..6fdd79785f32 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -174,7 +174,7 @@ match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_match *match,
@@ -185,9 +185,9 @@ checkentry(const char *tablename,
 
 	if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) {
 		DEBUGP("ip6t_opts: unknown flags %X\n", optsinfo->invflags);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match opts_match[] = {
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 3222e8959426..5ba6ef0f1b1b 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -124,7 +124,7 @@ ipv6header_match(const struct sk_buff *skb,
 	}
 }
 
-static int
+static bool
 ipv6header_checkentry(const char *tablename,
 		      const void *ip,
 		      const struct xt_match *match,
@@ -136,9 +136,9 @@ ipv6header_checkentry(const char *tablename,
 	/* invflags is 0 or 0xff in hard mode */
 	if ((!info->modeflag) && info->invflags != 0x00 &&
 	    info->invflags != 0xFF)
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 static struct xt_match ip6t_ipv6header_match = {
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
index ddffe03a8b37..a3008b41d24b 100644
--- a/net/ipv6/netfilter/ip6t_mh.c
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -75,7 +75,7 @@ match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 mh_checkentry(const char *tablename,
 	      const void *entry,
 	      const struct xt_match *match,
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index cadd0a64fed7..8cb6c94b4a20 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
 	return true;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
@@ -65,9 +65,9 @@ checkentry(const char *tablename,
 	if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) {
 		printk("ipt_owner: pid and sid matching "
 		       "not supported anymore\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match owner_match = {
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 7966f4a5e9b7..e991ed4a692e 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -198,7 +198,7 @@ match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_match *match,
@@ -209,17 +209,17 @@ checkentry(const char *tablename,
 
 	if (rtinfo->invflags & ~IP6T_RT_INV_MASK) {
 		DEBUGP("ip6t_rt: unknown flags %X\n", rtinfo->invflags);
-		return 0;
+		return false;
 	}
 	if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) &&
 	    (!(rtinfo->flags & IP6T_RT_TYP) ||
 	     (rtinfo->rt_type != 0) ||
 	     (rtinfo->invflags & IP6T_RT_INV_TYP))) {
 		DEBUGP("`--rt-type 0' required before `--rt-0-*'");
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static struct xt_match rt_match = {
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index aada7b797549..12541784109a 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -95,31 +95,31 @@ match(const struct sk_buff *skb,
 		return (what >= sinfo->count.from);
 }
 
-static int check(const char *tablename,
-		 const void *ip,
-		 const struct xt_match *match,
-		 void *matchinfo,
-		 unsigned int hook_mask)
+static bool check(const char *tablename,
+		  const void *ip,
+		  const struct xt_match *match,
+		  void *matchinfo,
+		  unsigned int hook_mask)
 {
 	const struct xt_connbytes_info *sinfo = matchinfo;
 
 	if (sinfo->what != XT_CONNBYTES_PKTS &&
 	    sinfo->what != XT_CONNBYTES_BYTES &&
 	    sinfo->what != XT_CONNBYTES_AVGPKT)
-		return 0;
+		return false;
 
 	if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL &&
 	    sinfo->direction != XT_CONNBYTES_DIR_REPLY &&
 	    sinfo->direction != XT_CONNBYTES_DIR_BOTH)
-		return 0;
+		return false;
 
 	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", match->family);
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 3321b80aff4f..94d5251b3d88 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -51,7 +51,7 @@ match(const struct sk_buff *skb,
 	return (((ct->mark) & info->mask) == info->mark) ^ info->invert;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
@@ -62,14 +62,14 @@ checkentry(const char *tablename,
 
 	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
 		printk(KERN_WARNING "connmark: only support 32bit mark\n");
-		return 0;
+		return false;
 	}
 	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", match->family);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 26901f95bf4b..87364f58a4b9 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -114,7 +114,7 @@ match(const struct sk_buff *skb,
 	return true;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
@@ -124,9 +124,9 @@ checkentry(const char *tablename,
 	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", match->family);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static void destroy(const struct xt_match *match, void *matchinfo)
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index b0eba4e2c53f..24895902cfe0 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -126,7 +126,7 @@ match(const struct sk_buff *skb,
 			   XT_DCCP_OPTION, info->flags, info->invflags);
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *inf,
 	   const struct xt_match *match,
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index c9c6518907a2..35cabca28eff 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -52,20 +52,20 @@ static bool match6(const struct sk_buff *skb,
 	return (dscp == info->dscp) ^ !!info->invert;
 }
 
-static int checkentry(const char *tablename,
-		      const void *info,
-		      const struct xt_match *match,
-		      void *matchinfo,
-		      unsigned int hook_mask)
+static bool checkentry(const char *tablename,
+		       const void *info,
+		       const struct xt_match *match,
+		       void *matchinfo,
+		       unsigned int hook_mask)
 {
 	const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp;
 
 	if (dscp > XT_DSCP_MAX) {
 		printk(KERN_ERR "xt_dscp: dscp %x out of range\n", dscp);
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static struct xt_match xt_dscp_match[] = {
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index 1a945cb7c359..1a6ae8a047c7 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -74,7 +74,7 @@ match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *ip_void,
 	   const struct xt_match *match,
@@ -85,10 +85,10 @@ checkentry(const char *tablename,
 
 	if (espinfo->invflags & ~XT_ESP_INV_MASK) {
 		duprintf("xt_esp: unknown flags %X\n", espinfo->invflags);
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static struct xt_match xt_esp_match[] = {
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 21597b755cea..a1b5996447dd 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -492,7 +492,7 @@ hotdrop:
 	return false;
 }
 
-static int
+static bool
 hashlimit_checkentry(const char *tablename,
 		     const void *inf,
 		     const struct xt_match *match,
@@ -506,20 +506,20 @@ hashlimit_checkentry(const char *tablename,
 	    user2credits(r->cfg.avg * r->cfg.burst) < user2credits(r->cfg.avg)) {
 		printk(KERN_ERR "xt_hashlimit: overflow, try lower: %u/%u\n",
 		       r->cfg.avg, r->cfg.burst);
-		return 0;
+		return false;
 	}
 	if (r->cfg.mode == 0 ||
 	    r->cfg.mode > (XT_HASHLIMIT_HASH_DPT |
 			   XT_HASHLIMIT_HASH_DIP |
 			   XT_HASHLIMIT_HASH_SIP |
 			   XT_HASHLIMIT_HASH_SPT))
-		return 0;
+		return false;
 	if (!r->cfg.gc_interval)
-		return 0;
+		return false;
 	if (!r->cfg.expire)
-		return 0;
+		return false;
 	if (r->name[sizeof(r->name) - 1] != '\0')
-		return 0;
+		return false;
 
 	/* This is the best we've got: We cannot release and re-grab lock,
 	 * since checkentry() is called before x_tables.c grabs xt_mutex.
@@ -531,13 +531,13 @@ hashlimit_checkentry(const char *tablename,
 	r->hinfo = htable_find_get(r->name, match->family);
 	if (!r->hinfo && htable_create(r, match->family) != 0) {
 		mutex_unlock(&hlimit_mutex);
-		return 0;
+		return false;
 	}
 	mutex_unlock(&hlimit_mutex);
 
 	/* Ugly hack: For SMP, we only want to use one set */
 	r->u.master = r;
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index 10c629b34abf..a2688b807a99 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -76,21 +76,21 @@ out_unlock:
 	return ret;
 }
 
-static int check(const char *tablename,
-		 const void *inf,
-		 const struct xt_match *match,
-		 void *matchinfo,
-		 unsigned int hook_mask)
+static bool check(const char *tablename,
+		  const void *inf,
+		  const struct xt_match *match,
+		  void *matchinfo,
+		  unsigned int hook_mask)
 {
 	struct xt_helper_info *info = matchinfo;
 
 	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", match->family);
-		return 0;
+		return false;
 	}
 	info->name[29] = '\0';
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 0cfe241a0493..2717aa65246a 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -98,7 +98,7 @@ user2credits(u_int32_t user)
 	return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE;
 }
 
-static int
+static bool
 ipt_limit_checkentry(const char *tablename,
 		     const void *inf,
 		     const struct xt_match *match,
@@ -112,7 +112,7 @@ ipt_limit_checkentry(const char *tablename,
 	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
 		printk("Overflow in xt_limit, try lower: %u/%u\n",
 		       r->avg, r->burst);
-		return 0;
+		return false;
 	}
 
 	/* For SMP, we only want to use one set of counters. */
@@ -125,7 +125,7 @@ ipt_limit_checkentry(const char *tablename,
 		r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
 		r->cost = user2credits(r->avg);
 	}
-	return 1;
+	return true;
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 10c6799cd56a..83ed806764b4 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -34,7 +34,7 @@ match(const struct sk_buff *skb,
 	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_match *match,
@@ -45,9 +45,9 @@ checkentry(const char *tablename,
 
 	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
 		printk(KERN_WARNING "mark: only supports 32bit mark\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index 55feb3d737d4..3d69d6208965 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -154,7 +154,7 @@ match_v1(const struct sk_buff *skb,
 	return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
 }
 
-static inline int
+static inline bool
 check(u_int16_t proto,
       u_int8_t ip_invflags,
       u_int8_t match_flags,
@@ -172,7 +172,7 @@ check(u_int16_t proto,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *info,
 	   const struct xt_match *match,
@@ -186,7 +186,7 @@ checkentry(const char *tablename,
 		     multiinfo->count);
 }
 
-static int
+static bool
 checkentry_v1(const char *tablename,
 	      const void *info,
 	      const struct xt_match *match,
@@ -200,7 +200,7 @@ checkentry_v1(const char *tablename,
 		     multiinfo->count);
 }
 
-static int
+static bool
 checkentry6(const char *tablename,
 	    const void *info,
 	    const struct xt_match *match,
@@ -214,7 +214,7 @@ checkentry6(const char *tablename,
 		     multiinfo->count);
 }
 
-static int
+static bool
 checkentry6_v1(const char *tablename,
 	       const void *info,
 	       const struct xt_match *match,
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 70de6708e884..34f0d3e44ea7 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -99,7 +99,7 @@ match_outdev:
 	return ret ^ !(info->invert & XT_PHYSDEV_OP_OUT);
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 		       const void *ip,
 		       const struct xt_match *match,
@@ -110,7 +110,7 @@ checkentry(const char *tablename,
 
 	if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
 	    info->bitmask & ~XT_PHYSDEV_OP_MASK)
-		return 0;
+		return false;
 	if (info->bitmask & XT_PHYSDEV_OP_OUT &&
 	    (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
 	     info->invert & XT_PHYSDEV_OP_BRIDGED) &&
@@ -120,9 +120,9 @@ checkentry(const char *tablename,
 		       "OUTPUT, FORWARD and POSTROUTING chains for non-bridged "
 		       "traffic is not supported anymore.\n");
 		if (hook_mask & (1 << NF_IP_LOCAL_OUT))
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match xt_physdev_match[] = {
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 0aa487b1f3b8..1534de55cdb6 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -133,35 +133,35 @@ static bool match(const struct sk_buff *skb,
 	return ret;
 }
 
-static int checkentry(const char *tablename, const void *ip_void,
-		      const struct xt_match *match,
-		      void *matchinfo, unsigned int hook_mask)
+static bool checkentry(const char *tablename, const void *ip_void,
+		       const struct xt_match *match,
+		       void *matchinfo, unsigned int hook_mask)
 {
 	struct xt_policy_info *info = matchinfo;
 
 	if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) {
 		printk(KERN_ERR "xt_policy: neither incoming nor "
 				"outgoing policy selected\n");
-		return 0;
+		return false;
 	}
 	/* hook values are equal for IPv4 and IPv6 */
 	if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN)
 	    && info->flags & XT_POLICY_MATCH_OUT) {
 		printk(KERN_ERR "xt_policy: output policy not valid in "
 				"PRE_ROUTING and INPUT\n");
-		return 0;
+		return false;
 	}
 	if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT)
 	    && info->flags & XT_POLICY_MATCH_IN) {
 		printk(KERN_ERR "xt_policy: input policy not valid in "
 				"POST_ROUTING and OUTPUT\n");
-		return 0;
+		return false;
 	}
 	if (info->len > XT_POLICY_MAX_ELEM) {
 		printk(KERN_ERR "xt_policy: too many policy elements\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_match xt_policy_match[] = {
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 6091347e38b3..e13d62a8caba 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -38,7 +38,7 @@ match(const struct sk_buff *skb,
 	return ret;
 }
 
-static int
+static bool
 checkentry(const char *tablename, const void *entry,
 	   const struct xt_match *match, void *matchinfo,
 	   unsigned int hook_mask)
@@ -46,10 +46,10 @@ checkentry(const char *tablename, const void *entry,
 	struct xt_quota_info *q = (struct xt_quota_info *)matchinfo;
 
 	if (q->flags & ~XT_QUOTA_MASK)
-		return 0;
+		return false;
 	/* For SMP, we only want to use one set of counters. */
 	q->master = q;
-	return 1;
+	return true;
 }
 
 static struct xt_match xt_quota_match[] = {
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index a118a4c71563..22df338b3934 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -158,7 +158,7 @@ match(const struct sk_buff *skb,
 			   XT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *inf,
 	   const struct xt_match *match,
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index f77f74ad5c97..5b9c59aa14d3 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -44,18 +44,18 @@ match(const struct sk_buff *skb,
 	return (sinfo->statemask & statebit);
 }
 
-static int check(const char *tablename,
-		 const void *inf,
-		 const struct xt_match *match,
-		 void *matchinfo,
-		 unsigned int hook_mask)
+static bool check(const char *tablename,
+		  const void *inf,
+		  const struct xt_match *match,
+		  void *matchinfo,
+		  unsigned int hook_mask)
 {
 	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", match->family);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 989924f9024e..0af42892e9dc 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -52,7 +52,7 @@ match(const struct sk_buff *skb,
 	return ret;
 }
 
-static int
+static bool
 checkentry(const char *tablename, const void *entry,
 	   const struct xt_match *match, void *matchinfo,
 	   unsigned int hook_mask)
@@ -61,9 +61,9 @@ checkentry(const char *tablename, const void *entry,
 
 	if (info->mode > XT_STATISTIC_MODE_MAX ||
 	    info->flags & ~XT_STATISTIC_MASK)
-		return 0;
+		return false;
 	info->master = info;
-	return 1;
+	return true;
 }
 
 static struct xt_match xt_statistic_match[] = {
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 3aea43d37339..ab761b17f811 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -42,30 +42,30 @@ static bool match(const struct sk_buff *skb,
 
 #define STRING_TEXT_PRIV(m) ((struct xt_string_info *) m)
 
-static int checkentry(const char *tablename,
-		      const void *ip,
-		      const struct xt_match *match,
-		      void *matchinfo,
-		      unsigned int hook_mask)
+static bool checkentry(const char *tablename,
+		       const void *ip,
+		       const struct xt_match *match,
+		       void *matchinfo,
+		       unsigned int hook_mask)
 {
 	struct xt_string_info *conf = matchinfo;
 	struct ts_config *ts_conf;
 
 	/* Damn, can't handle this case properly with iptables... */
 	if (conf->from_offset > conf->to_offset)
-		return 0;
+		return false;
 	if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0')
-		return 0;
+		return false;
 	if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE)
-		return 0;
+		return false;
 	ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
 				     GFP_KERNEL, TS_AUTOLOAD);
 	if (IS_ERR(ts_conf))
-		return 0;
+		return false;
 
 	conf->config = ts_conf;
 
-	return 1;
+	return true;
 }
 
 static void destroy(const struct xt_match *match, void *matchinfo)
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 9ecc4a5bd529..0dd3022cc79a 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -133,7 +133,7 @@ tcp_match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 tcp_checkentry(const char *tablename,
 	       const void *info,
 	       const struct xt_match *match,
@@ -181,7 +181,7 @@ udp_match(const struct sk_buff *skb,
 }
 
 /* Called when user tries to insert an entry of this type. */
-static int
+static bool
 udp_checkentry(const char *tablename,
 	       const void *info,
 	       const struct xt_match *match,
-- 
cgit v1.3-8-gc7d7


From e1931b784a8de324abf310fa3b5e3f25d3988233 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sat, 7 Jul 2007 22:16:26 -0700
Subject: [NETFILTER]: x_tables: switch xt_target->checkentry to bool

Switch the return type of target checkentry functions to boolean.

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h  | 10 +++++-----
 net/ipv4/netfilter/arpt_mangle.c    |  8 ++++----
 net/ipv4/netfilter/ipt_CLUSTERIP.c  | 28 ++++++++++++++--------------
 net/ipv4/netfilter/ipt_ECN.c        | 30 +++++++++++++++---------------
 net/ipv4/netfilter/ipt_LOG.c        | 16 ++++++++--------
 net/ipv4/netfilter/ipt_MASQUERADE.c |  8 ++++----
 net/ipv4/netfilter/ipt_NETMAP.c     |  8 ++++----
 net/ipv4/netfilter/ipt_REDIRECT.c   |  8 ++++----
 net/ipv4/netfilter/ipt_REJECT.c     | 16 ++++++++--------
 net/ipv4/netfilter/ipt_SAME.c       | 14 +++++++-------
 net/ipv4/netfilter/ipt_TOS.c        |  6 +++---
 net/ipv4/netfilter/ipt_TTL.c        |  8 ++++----
 net/ipv4/netfilter/ipt_ULOG.c       | 16 ++++++++--------
 net/ipv4/netfilter/nf_nat_rule.c    | 28 ++++++++++++++--------------
 net/ipv6/netfilter/ip6t_HL.c        |  8 ++++----
 net/ipv6/netfilter/ip6t_LOG.c       | 16 ++++++++--------
 net/ipv6/netfilter/ip6t_REJECT.c    | 16 ++++++++--------
 net/netfilter/xt_CONNMARK.c         | 10 +++++-----
 net/netfilter/xt_CONNSECMARK.c      | 12 ++++++------
 net/netfilter/xt_DSCP.c             | 14 +++++++-------
 net/netfilter/xt_MARK.c             | 14 +++++++-------
 net/netfilter/xt_NFLOG.c            |  8 ++++----
 net/netfilter/xt_SECMARK.c          | 24 ++++++++++++------------
 net/netfilter/xt_TCPMSS.c           | 22 +++++++++++-----------
 24 files changed, 174 insertions(+), 174 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 5130dd60a2fc..64f425a855bb 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -202,11 +202,11 @@ struct xt_target
            hook_mask is a bitmask of hooks from which it can be
            called. */
 	/* Should return true or false. */
-	int (*checkentry)(const char *tablename,
-			  const void *entry,
-			  const struct xt_target *target,
-			  void *targinfo,
-			  unsigned int hook_mask);
+	bool (*checkentry)(const char *tablename,
+			   const void *entry,
+			   const struct xt_target *target,
+			   void *targinfo,
+			   unsigned int hook_mask);
 
 	/* Called when entry of this type deleted. */
 	void (*destroy)(const struct xt_target *target, void *targinfo);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 6298d404e7c7..497a16e0b064 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -65,7 +65,7 @@ target(struct sk_buff **pskb,
 	return mangle->target;
 }
 
-static int
+static bool
 checkentry(const char *tablename, const void *e, const struct xt_target *target,
 	   void *targinfo, unsigned int hook_mask)
 {
@@ -73,12 +73,12 @@ checkentry(const char *tablename, const void *e, const struct xt_target *target,
 
 	if (mangle->flags & ~ARPT_MANGLE_MASK ||
 	    !(mangle->flags & ARPT_MANGLE_MASK))
-		return 0;
+		return false;
 
 	if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
 	   mangle->target != ARPT_CONTINUE)
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 static struct arpt_target arpt_mangle_reg = {
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 40e273421398..e82339a78c01 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -220,17 +220,17 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
 	return 0;
 }
 
-static int
+static bool
 clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
 {
 	if (nodenum == 0 ||
 	    nodenum > c->num_total_nodes)
-		return 1;
+		return true;
 
 	if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 #endif
 
@@ -370,7 +370,7 @@ target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *e_void,
 	   const struct xt_target *target,
@@ -387,13 +387,13 @@ checkentry(const char *tablename,
 	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
 		printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
 			cipinfo->hash_mode);
-		return 0;
+		return false;
 
 	}
 	if (e->ip.dmsk.s_addr != htonl(0xffffffff)
 	    || e->ip.dst.s_addr == 0) {
 		printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
-		return 0;
+		return false;
 	}
 
 	/* FIXME: further sanity checks */
@@ -407,7 +407,7 @@ checkentry(const char *tablename,
 			if (cipinfo->config != config) {
 				printk(KERN_ERR "CLUSTERIP: Reloaded entry "
 				       "has invalid config pointer!\n");
-				return 0;
+				return false;
 			}
 		} else {
 			/* Case B: This is a new rule referring to an existing
@@ -418,19 +418,19 @@ checkentry(const char *tablename,
 		/* Case C: This is a completely new clusterip config */
 		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
 			printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
-			return 0;
+			return false;
 		} else {
 			struct net_device *dev;
 
 			if (e->ip.iniface[0] == '\0') {
 				printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
-				return 0;
+				return false;
 			}
 
 			dev = dev_get_by_name(e->ip.iniface);
 			if (!dev) {
 				printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
-				return 0;
+				return false;
 			}
 
 			config = clusterip_config_init(cipinfo,
@@ -438,7 +438,7 @@ checkentry(const char *tablename,
 			if (!config) {
 				printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
 				dev_put(dev);
-				return 0;
+				return false;
 			}
 			dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
 		}
@@ -448,10 +448,10 @@ checkentry(const char *tablename,
 	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", target->family);
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 /* drop reference count of cluster config when rule is deleted */
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 918ca92e534a..02367012fc74 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -24,8 +24,8 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("iptables ECN modification module");
 
 /* set ECT codepoint from IP header.
- * 	return 0 if there was an error. */
-static inline int
+ * 	return false if there was an error. */
+static inline bool
 set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 {
 	struct iphdr *iph = ip_hdr(*pskb);
@@ -33,18 +33,18 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 	if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
 		__u8 oldtos;
 		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
-			return 0;
+			return false;
 		iph = ip_hdr(*pskb);
 		oldtos = iph->tos;
 		iph->tos &= ~IPT_ECN_IP_MASK;
 		iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
 		nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
 	}
-	return 1;
+	return true;
 }
 
-/* Return 0 if there was an error. */
-static inline int
+/* Return false if there was an error. */
+static inline bool
 set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 {
 	struct tcphdr _tcph, *tcph;
@@ -54,16 +54,16 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 	tcph = skb_header_pointer(*pskb, ip_hdrlen(*pskb),
 				  sizeof(_tcph), &_tcph);
 	if (!tcph)
-		return 0;
+		return false;
 
 	if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
 	     tcph->ece == einfo->proto.tcp.ece) &&
 	    ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
 	     tcph->cwr == einfo->proto.tcp.cwr)))
-		return 1;
+		return true;
 
 	if (!skb_make_writable(pskb, ip_hdrlen(*pskb) + sizeof(*tcph)))
-		return 0;
+		return false;
 	tcph = (void *)ip_hdr(*pskb) + ip_hdrlen(*pskb);
 
 	oldval = ((__be16 *)tcph)[6];
@@ -74,7 +74,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 
 	nf_proto_csum_replace2(&tcph->check, *pskb,
 				oldval, ((__be16 *)tcph)[6], 0);
-	return 1;
+	return true;
 }
 
 static unsigned int
@@ -99,7 +99,7 @@ target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *e_void,
 	   const struct xt_target *target,
@@ -112,20 +112,20 @@ checkentry(const char *tablename,
 	if (einfo->operation & IPT_ECN_OP_MASK) {
 		printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
 			einfo->operation);
-		return 0;
+		return false;
 	}
 	if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
 		printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n",
 			einfo->ip_ect);
-		return 0;
+		return false;
 	}
 	if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
 	    && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
 		printk(KERN_WARNING "ECN: cannot use TCP operations on a "
 		       "non-tcp rule\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target ipt_ecn_reg = {
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index a42c5cd968b1..bbff6c352ef8 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -435,24 +435,24 @@ ipt_log_target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int ipt_log_checkentry(const char *tablename,
-			      const void *e,
-			      const struct xt_target *target,
-			      void *targinfo,
-			      unsigned int hook_mask)
+static bool ipt_log_checkentry(const char *tablename,
+			       const void *e,
+			       const struct xt_target *target,
+			       void *targinfo,
+			       unsigned int hook_mask)
 {
 	const struct ipt_log_info *loginfo = targinfo;
 
 	if (loginfo->level >= 8) {
 		DEBUGP("LOG: level %u >= 8\n", loginfo->level);
-		return 0;
+		return false;
 	}
 	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
 		DEBUGP("LOG: prefix term %i\n",
 		       loginfo->prefix[sizeof(loginfo->prefix)-1]);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target ipt_log_reg = {
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index d4f2d7775330..b5b216408ee7 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -37,7 +37,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
 static DEFINE_RWLOCK(masq_lock);
 
 /* FIXME: Multiple targets. --RR */
-static int
+static bool
 masquerade_check(const char *tablename,
 		 const void *e,
 		 const struct xt_target *target,
@@ -48,13 +48,13 @@ masquerade_check(const char *tablename,
 
 	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
 		DEBUGP("masquerade_check: bad MAP_IPS.\n");
-		return 0;
+		return false;
 	}
 	if (mr->rangesize != 1) {
 		DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static unsigned int
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 068c69bce30e..a902c71218bf 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -29,7 +29,7 @@ MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
 #define DEBUGP(format, args...)
 #endif
 
-static int
+static bool
 check(const char *tablename,
       const void *e,
       const struct xt_target *target,
@@ -40,13 +40,13 @@ check(const char *tablename,
 
 	if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
 		DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
-		return 0;
+		return false;
 	}
 	if (mr->rangesize != 1) {
 		DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static unsigned int
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 68cc76a198eb..2a04103b50d1 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -32,7 +32,7 @@ MODULE_DESCRIPTION("iptables REDIRECT target module");
 #endif
 
 /* FIXME: Take multiple ranges --RR */
-static int
+static bool
 redirect_check(const char *tablename,
 	       const void *e,
 	       const struct xt_target *target,
@@ -43,13 +43,13 @@ redirect_check(const char *tablename,
 
 	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
 		DEBUGP("redirect_check: bad MAP_IPS.\n");
-		return 0;
+		return false;
 	}
 	if (mr->rangesize != 1) {
 		DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static unsigned int
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 9041e0741f6f..5c3270d325f3 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -217,27 +217,27 @@ static unsigned int reject(struct sk_buff **pskb,
 	return NF_DROP;
 }
 
-static int check(const char *tablename,
-		 const void *e_void,
-		 const struct xt_target *target,
-		 void *targinfo,
-		 unsigned int hook_mask)
+static bool check(const char *tablename,
+		  const void *e_void,
+		  const struct xt_target *target,
+		  void *targinfo,
+		  unsigned int hook_mask)
 {
 	const struct ipt_reject_info *rejinfo = targinfo;
 	const struct ipt_entry *e = e_void;
 
 	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
 		printk("REJECT: ECHOREPLY no longer supported.\n");
-		return 0;
+		return false;
 	} else if (rejinfo->with == IPT_TCP_RESET) {
 		/* Must specify that it's a TCP packet */
 		if (e->ip.proto != IPPROTO_TCP
 		    || (e->ip.invflags & XT_INV_PROTO)) {
 			DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
-			return 0;
+			return false;
 		}
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target ipt_reject_reg = {
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
index 511e5ff84938..3649fabc04ea 100644
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
 #define DEBUGP(format, args...)
 #endif
 
-static int
+static bool
 same_check(const char *tablename,
 	      const void *e,
 	      const struct xt_target *target,
@@ -47,13 +47,13 @@ same_check(const char *tablename,
 
 	if (mr->rangesize < 1) {
 		DEBUGP("same_check: need at least one dest range.\n");
-		return 0;
+		return false;
 	}
 	if (mr->rangesize > IPT_SAME_MAX_RANGE) {
 		DEBUGP("same_check: too many ranges specified, maximum "
 				"is %u ranges\n",
 				IPT_SAME_MAX_RANGE);
-		return 0;
+		return false;
 	}
 	for (count = 0; count < mr->rangesize; count++) {
 		if (ntohl(mr->range[count].min_ip) >
@@ -62,11 +62,11 @@ same_check(const char *tablename,
 				"range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
 				NIPQUAD(mr->range[count].min_ip),
 				NIPQUAD(mr->range[count].max_ip));
-			return 0;
+			return false;
 		}
 		if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
 			DEBUGP("same_check: bad MAP_IPS.\n");
-			return 0;
+			return false;
 		}
 		rangeip = (ntohl(mr->range[count].max_ip) -
 					ntohl(mr->range[count].min_ip) + 1);
@@ -81,7 +81,7 @@ same_check(const char *tablename,
 		DEBUGP("same_check: Couldn't allocate %u bytes "
 			"for %u ipaddresses!\n",
 			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
-		return 0;
+		return false;
 	}
 	DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n",
 			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
@@ -97,7 +97,7 @@ same_check(const char *tablename,
 			index++;
 		}
 	}
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 0ad02f249837..ac43e86afbcf 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -43,7 +43,7 @@ target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *e_void,
 	   const struct xt_target *target,
@@ -58,9 +58,9 @@ checkentry(const char *tablename,
 	    && tos != IPTOS_MINCOST
 	    && tos != IPTOS_NORMALSVC) {
 		printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target ipt_tos_reg = {
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index a991ec7bd4e7..96b6e3514c22 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -62,7 +62,7 @@ ipt_ttl_target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int ipt_ttl_checkentry(const char *tablename,
+static bool ipt_ttl_checkentry(const char *tablename,
 		const void *e,
 		const struct xt_target *target,
 		void *targinfo,
@@ -73,11 +73,11 @@ static int ipt_ttl_checkentry(const char *tablename,
 	if (info->mode > IPT_TTL_MAXMODE) {
 		printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
 			info->mode);
-		return 0;
+		return false;
 	}
 	if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 static struct xt_target ipt_TTL = {
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 23b607b33b32..dfa7afd84763 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -328,25 +328,25 @@ static void ipt_logfn(unsigned int pf,
 	ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
 }
 
-static int ipt_ulog_checkentry(const char *tablename,
-			       const void *e,
-			       const struct xt_target *target,
-			       void *targinfo,
-			       unsigned int hookmask)
+static bool ipt_ulog_checkentry(const char *tablename,
+				const void *e,
+				const struct xt_target *target,
+				void *targinfo,
+				unsigned int hookmask)
 {
 	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
 
 	if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
 		DEBUGP("ipt_ULOG: prefix term %i\n",
 		       loginfo->prefix[sizeof(loginfo->prefix) - 1]);
-		return 0;
+		return false;
 	}
 	if (loginfo->qthreshold > ULOG_MAX_QLEN) {
 		DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
 			loginfo->qthreshold);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 6740736c5e79..fc3d9437beba 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -140,36 +140,36 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb,
 	return nf_nat_setup_info(ct, &mr->range[0], hooknum);
 }
 
-static int ipt_snat_checkentry(const char *tablename,
-			       const void *entry,
-			       const struct xt_target *target,
-			       void *targinfo,
-			       unsigned int hook_mask)
+static bool ipt_snat_checkentry(const char *tablename,
+				const void *entry,
+				const struct xt_target *target,
+				void *targinfo,
+				unsigned int hook_mask)
 {
 	struct nf_nat_multi_range_compat *mr = targinfo;
 
 	/* Must be a valid range */
 	if (mr->rangesize != 1) {
 		printk("SNAT: multiple ranges no longer supported\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
-static int ipt_dnat_checkentry(const char *tablename,
-			       const void *entry,
-			       const struct xt_target *target,
-			       void *targinfo,
-			       unsigned int hook_mask)
+static bool ipt_dnat_checkentry(const char *tablename,
+				const void *entry,
+				const struct xt_target *target,
+				void *targinfo,
+				unsigned int hook_mask)
 {
 	struct nf_nat_multi_range_compat *mr = targinfo;
 
 	/* Must be a valid range */
 	if (mr->rangesize != 1) {
 		printk("DNAT: multiple ranges no longer supported\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 inline unsigned int
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
index 4115a576ba25..82966c09fd64 100644
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -58,7 +58,7 @@ static unsigned int ip6t_hl_target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int ip6t_hl_checkentry(const char *tablename,
+static bool ip6t_hl_checkentry(const char *tablename,
 		const void *entry,
 		const struct xt_target *target,
 		void *targinfo,
@@ -69,14 +69,14 @@ static int ip6t_hl_checkentry(const char *tablename,
 	if (info->mode > IP6T_HL_MAXMODE) {
 		printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n",
 			info->mode);
-		return 0;
+		return false;
 	}
 	if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) {
 		printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't "
 			"make sense with value 0\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target ip6t_HL = {
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 5bb9cd349350..aa4b9a14a11c 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -448,24 +448,24 @@ ip6t_log_target(struct sk_buff **pskb,
 }
 
 
-static int ip6t_log_checkentry(const char *tablename,
-			       const void *entry,
-			       const struct xt_target *target,
-			       void *targinfo,
-			       unsigned int hook_mask)
+static bool ip6t_log_checkentry(const char *tablename,
+				const void *entry,
+				const struct xt_target *target,
+				void *targinfo,
+				unsigned int hook_mask)
 {
 	const struct ip6t_log_info *loginfo = targinfo;
 
 	if (loginfo->level >= 8) {
 		DEBUGP("LOG: level %u >= 8\n", loginfo->level);
-		return 0;
+		return false;
 	}
 	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
 		DEBUGP("LOG: prefix term %i\n",
 		       loginfo->prefix[sizeof(loginfo->prefix)-1]);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target ip6t_log_reg = {
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index cb3d2415a064..8639a0599bf5 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -221,27 +221,27 @@ static unsigned int reject6_target(struct sk_buff **pskb,
 	return NF_DROP;
 }
 
-static int check(const char *tablename,
-		 const void *entry,
-		 const struct xt_target *target,
-		 void *targinfo,
-		 unsigned int hook_mask)
+static bool check(const char *tablename,
+		  const void *entry,
+		  const struct xt_target *target,
+		  void *targinfo,
+		  unsigned int hook_mask)
 {
 	const struct ip6t_reject_info *rejinfo = targinfo;
 	const struct ip6t_entry *e = entry;
 
 	if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
 		printk("ip6t_REJECT: ECHOREPLY is not supported.\n");
-		return 0;
+		return false;
 	} else if (rejinfo->with == IP6T_TCP_RESET) {
 		/* Must specify that it's a TCP packet */
 		if (e->ipv6.proto != IPPROTO_TCP
 		    || (e->ipv6.invflags & XT_INV_PROTO)) {
 			DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
-			return 0;
+			return false;
 		}
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target ip6t_reject_reg = {
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index b03ce009d0bf..4e8aa1b0cba2 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -76,7 +76,7 @@ target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int
+static bool
 checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_target *target,
@@ -88,21 +88,21 @@ checkentry(const char *tablename,
 	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", target->family);
-		return 0;
+		return false;
 	}
 	if (matchinfo->mode == XT_CONNMARK_RESTORE) {
 		if (strcmp(tablename, "mangle") != 0) {
 			printk(KERN_WARNING "CONNMARK: restore can only be "
 			       "called from \"mangle\" table, not \"%s\"\n",
 			       tablename);
-			return 0;
+			return false;
 		}
 	}
 	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
 		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 81c0c58bab47..ab2f0d016953 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -85,16 +85,16 @@ static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
 	return XT_CONTINUE;
 }
 
-static int checkentry(const char *tablename, const void *entry,
-		      const struct xt_target *target, void *targinfo,
-		      unsigned int hook_mask)
+static bool checkentry(const char *tablename, const void *entry,
+		       const struct xt_target *target, void *targinfo,
+		       unsigned int hook_mask)
 {
 	struct xt_connsecmark_target_info *info = targinfo;
 
 	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
 				    "proto=%d\n", target->family);
-		return 0;
+		return false;
 	}
 	switch (info->mode) {
 	case CONNSECMARK_SAVE:
@@ -103,10 +103,10 @@ static int checkentry(const char *tablename, const void *entry,
 
 	default:
 		printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static void
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 9f2f2201f6ae..2d779f6902dc 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -66,19 +66,19 @@ static unsigned int target6(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int checkentry(const char *tablename,
-		      const void *e_void,
-		      const struct xt_target *target,
-		      void *targinfo,
-		      unsigned int hook_mask)
+static bool checkentry(const char *tablename,
+		       const void *e_void,
+		       const struct xt_target *target,
+		       void *targinfo,
+		       unsigned int hook_mask)
 {
 	const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp;
 
 	if ((dscp > XT_DSCP_MAX)) {
 		printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static struct xt_target xt_dscp_target[] = {
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index 43817808d865..bd9cdf29cc3b 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -65,7 +65,7 @@ target_v1(struct sk_buff **pskb,
 }
 
 
-static int
+static bool
 checkentry_v0(const char *tablename,
 	      const void *entry,
 	      const struct xt_target *target,
@@ -76,12 +76,12 @@ checkentry_v0(const char *tablename,
 
 	if (markinfo->mark > 0xffffffff) {
 		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
-static int
+static bool
 checkentry_v1(const char *tablename,
 	      const void *entry,
 	      const struct xt_target *target,
@@ -95,13 +95,13 @@ checkentry_v1(const char *tablename,
 	    && markinfo->mode != XT_MARK_OR) {
 		printk(KERN_WARNING "MARK: unknown mode %u\n",
 		       markinfo->mode);
-		return 0;
+		return false;
 	}
 	if (markinfo->mark > 0xffffffff) {
 		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 901ed7abaa1b..0c6f2838cc98 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -38,7 +38,7 @@ nflog_target(struct sk_buff **pskb,
 	return XT_CONTINUE;
 }
 
-static int
+static bool
 nflog_checkentry(const char *tablename, const void *entry,
 		 const struct xt_target *target, void *targetinfo,
 		 unsigned int hookmask)
@@ -46,10 +46,10 @@ nflog_checkentry(const char *tablename, const void *entry,
 	struct xt_nflog_info *info = targetinfo;
 
 	if (info->flags & ~XT_NFLOG_MASK)
-		return 0;
+		return false;
 	if (info->prefix[sizeof(info->prefix) - 1] != '\0')
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 static struct xt_target xt_nflog_target[] = {
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 705f0e830a79..f3e78c592f3a 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -51,7 +51,7 @@ static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
 	return XT_CONTINUE;
 }
 
-static int checkentry_selinux(struct xt_secmark_target_info *info)
+static bool checkentry_selinux(struct xt_secmark_target_info *info)
 {
 	int err;
 	struct xt_secmark_target_selinux_info *sel = &info->u.sel;
@@ -63,50 +63,50 @@ static int checkentry_selinux(struct xt_secmark_target_info *info)
 		if (err == -EINVAL)
 			printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n",
 			       sel->selctx);
-		return 0;
+		return false;
 	}
 
 	if (!sel->selsid) {
 		printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n",
 		       sel->selctx);
-		return 0;
+		return false;
 	}
 
 	err = selinux_relabel_packet_permission(sel->selsid);
 	if (err) {
 		printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
-static int checkentry(const char *tablename, const void *entry,
-		      const struct xt_target *target, void *targinfo,
-		      unsigned int hook_mask)
+static bool checkentry(const char *tablename, const void *entry,
+		       const struct xt_target *target, void *targinfo,
+		       unsigned int hook_mask)
 {
 	struct xt_secmark_target_info *info = targinfo;
 
 	if (mode && mode != info->mode) {
 		printk(KERN_INFO PFX "mode already set to %hu cannot mix with "
 		       "rules for mode %hu\n", mode, info->mode);
-		return 0;
+		return false;
 	}
 
 	switch (info->mode) {
 	case SECMARK_MODE_SEL:
 		if (!checkentry_selinux(info))
-			return 0;
+			return false;
 		break;
 
 	default:
 		printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode);
-		return 0;
+		return false;
 	}
 
 	if (!mode)
 		mode = info->mode;
-	return 1;
+	return true;
 }
 
 static struct xt_target xt_secmark_target[] = {
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 15fe8f649510..075051acb554 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -197,19 +197,19 @@ xt_tcpmss_target6(struct sk_buff **pskb,
 #define TH_SYN 0x02
 
 /* Must specify -p tcp --syn */
-static inline int find_syn_match(const struct xt_entry_match *m)
+static inline bool find_syn_match(const struct xt_entry_match *m)
 {
 	const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data;
 
 	if (strcmp(m->u.kernel.match->name, "tcp") == 0 &&
 	    tcpinfo->flg_cmp & TH_SYN &&
 	    !(tcpinfo->invflags & XT_TCP_INV_FLAGS))
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
-static int
+static bool
 xt_tcpmss_checkentry4(const char *tablename,
 		      const void *entry,
 		      const struct xt_target *target,
@@ -225,16 +225,16 @@ xt_tcpmss_checkentry4(const char *tablename,
 			   (1 << NF_IP_POST_ROUTING))) != 0) {
 		printk("xt_TCPMSS: path-MTU clamping only supported in "
 		       "FORWARD, OUTPUT and POSTROUTING hooks\n");
-		return 0;
+		return false;
 	}
 	if (IPT_MATCH_ITERATE(e, find_syn_match))
-		return 1;
+		return true;
 	printk("xt_TCPMSS: Only works on TCP SYN packets\n");
-	return 0;
+	return false;
 }
 
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
-static int
+static bool
 xt_tcpmss_checkentry6(const char *tablename,
 		      const void *entry,
 		      const struct xt_target *target,
@@ -250,12 +250,12 @@ xt_tcpmss_checkentry6(const char *tablename,
 			   (1 << NF_IP6_POST_ROUTING))) != 0) {
 		printk("xt_TCPMSS: path-MTU clamping only supported in "
 		       "FORWARD, OUTPUT and POSTROUTING hooks\n");
-		return 0;
+		return false;
 	}
 	if (IP6T_MATCH_ITERATE(e, find_syn_match))
-		return 1;
+		return true;
 	printk("xt_TCPMSS: Only works on TCP SYN packets\n");
-	return 0;
+	return false;
 }
 #endif
 
-- 
cgit v1.3-8-gc7d7


From 1b50b8a371e90a5e110f466e4ac02cf6b5f681de Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sat, 7 Jul 2007 22:20:36 -0700
Subject: [NETFILTER]: Add u32 match

Along comes... xt_u32, a revamped ipt_u32 from POM-NG,
Plus:

    *	2007-06-02: added ipv6 support

    *	2007-06-05: uses kmalloc for the big buffer

    *   2007-06-05: added inversion

    *   2007-06-20: use skb_copy_bits() and get rid of the big buffer
        and lock (suggested by Pablo Neira Ayuso)

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_u32.h |  40 ++++++++++++
 net/netfilter/Kconfig            |  13 ++++
 net/netfilter/Makefile           |   1 +
 net/netfilter/xt_u32.c           | 135 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 include/linux/netfilter/xt_u32.h
 create mode 100644 net/netfilter/xt_u32.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_u32.h b/include/linux/netfilter/xt_u32.h
new file mode 100644
index 000000000000..9947f56cdbdd
--- /dev/null
+++ b/include/linux/netfilter/xt_u32.h
@@ -0,0 +1,40 @@
+#ifndef _XT_U32_H
+#define _XT_U32_H 1
+
+enum xt_u32_ops {
+	XT_U32_AND,
+	XT_U32_LEFTSH,
+	XT_U32_RIGHTSH,
+	XT_U32_AT,
+};
+
+struct xt_u32_location_element {
+	u_int32_t number;
+	u_int8_t nextop;
+};
+
+struct xt_u32_value_element {
+	u_int32_t min;
+	u_int32_t max;
+};
+
+/*
+ * Any way to allow for an arbitrary number of elements?
+ * For now, I settle with a limit of 10 each.
+ */
+#define XT_U32_MAXSIZE 10
+
+struct xt_u32_test {
+	struct xt_u32_location_element location[XT_U32_MAXSIZE+1];
+	struct xt_u32_value_element value[XT_U32_MAXSIZE+1];
+	u_int8_t nnums;
+	u_int8_t nvalues;
+};
+
+struct xt_u32 {
+	struct xt_u32_test tests[XT_U32_MAXSIZE+1];
+	u_int8_t ntests;
+	u_int8_t invert;
+};
+
+#endif /* _XT_U32_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index a567dae8e5fd..aa567faa2a88 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -635,6 +635,19 @@ config NETFILTER_XT_MATCH_TCPMSS
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_U32
+	tristate '"u32" match support'
+	depends on NETFILTER_XTABLES
+	---help---
+	  u32 allows you to extract quantities of up to 4 bytes from a packet,
+	  AND them with specified masks, shift them by specified amounts and
+	  test whether the results are in any of a set of specified ranges.
+	  The specification of what to extract is general enough to skip over
+	  headers with lengths stored in the packet, as in IP or TCP header
+	  lengths.
+
+	  Details and examples are in the kernel module source.
+
 config NETFILTER_XT_MATCH_HASHLIMIT
 	tristate '"hashlimit" match support'
 	depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b2b5c7566b26..3cf5b9cd6fe1 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -72,4 +72,5 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
new file mode 100644
index 000000000000..07068751a35c
--- /dev/null
+++ b/net/netfilter/xt_u32.c
@@ -0,0 +1,135 @@
+/*
+ *	xt_u32 - kernel module to match u32 packet content
+ *
+ *	Original author: Don Cohen <don@isis.cs3-inc.com>
+ *	© Jan Engelhardt <jengelh@gmx.de>, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_u32.h>
+
+static bool u32_match_it(const struct xt_u32 *data,
+			 const struct sk_buff *skb)
+{
+	const struct xt_u32_test *ct;
+	unsigned int testind;
+	unsigned int nnums;
+	unsigned int nvals;
+	unsigned int i;
+	u_int32_t pos;
+	u_int32_t val;
+	u_int32_t at;
+	int ret;
+
+	/*
+	 * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17"
+	 * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands.
+	 */
+	for (testind = 0; testind < data->ntests; ++testind) {
+		ct  = &data->tests[testind];
+		at  = 0;
+		pos = ct->location[0].number;
+
+		if (skb->len < 4 || pos > skb->len - 4);
+			return false;
+
+		ret   = skb_copy_bits(skb, pos, &val, sizeof(val));
+		BUG_ON(ret < 0);
+		val   = ntohl(val);
+		nnums = ct->nnums;
+
+		/* Inner loop runs over "&", "<<", ">>" and "@" operands */
+		for (i = 1; i < nnums; ++i) {
+			u_int32_t number = ct->location[i].number;
+			switch (ct->location[i].nextop) {
+			case XT_U32_AND:
+				val &= number;
+				break;
+			case XT_U32_LEFTSH:
+				val <<= number;
+				break;
+			case XT_U32_RIGHTSH:
+				val >>= number;
+				break;
+			case XT_U32_AT:
+				if (at + val < at)
+					return false;
+				at += val;
+				pos = number;
+				if (at + 4 < at || skb->len < at + 4 ||
+				    pos > skb->len - at - 4)
+					return false;
+
+				ret = skb_copy_bits(skb, at + pos, &val,
+						    sizeof(val));
+				BUG_ON(ret < 0);
+				val = ntohl(val);
+				break;
+			}
+		}
+
+		/* Run over the "," and ":" operands */
+		nvals = ct->nvalues;
+		for (i = 0; i < nvals; ++i)
+			if (ct->value[i].min <= val && val <= ct->value[i].max)
+				break;
+
+		if (i >= ct->nvalues)
+			return false;
+	}
+
+	return true;
+}
+
+static bool u32_match(const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct xt_match *match, const void *matchinfo,
+		      int offset, unsigned int protoff, bool *hotdrop)
+{
+	const struct xt_u32 *data = matchinfo;
+	bool ret;
+
+	ret = u32_match_it(data, skb);
+	return ret ^ data->invert;
+}
+
+static struct xt_match u32_reg[] = {
+	{
+		.name       = "u32",
+		.family     = AF_INET,
+		.match      = u32_match,
+		.matchsize  = sizeof(struct xt_u32),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "u32",
+		.family     = AF_INET6,
+		.match      = u32_match,
+		.matchsize  = sizeof(struct xt_u32),
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init xt_u32_init(void)
+{
+	return xt_register_matches(u32_reg, ARRAY_SIZE(u32_reg));
+}
+
+static void __exit xt_u32_exit(void)
+{
+	xt_unregister_matches(u32_reg, ARRAY_SIZE(u32_reg));
+}
+
+module_init(xt_u32_init);
+module_exit(xt_u32_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@gmx.de>");
+MODULE_DESCRIPTION("netfilter u32 match module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_u32");
+MODULE_ALIAS("ip6t_u32");
-- 
cgit v1.3-8-gc7d7


From ba9dda3ab5a865542e69dfe01edb2436857c9420 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 7 Jul 2007 22:21:23 -0700
Subject: [NETFILTER]: x_tables: add TRACE target

The TRACE target can be used to follow IP and IPv6 packets through
the ruleset.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick NcHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          |   4 +-
 net/core/skbuff.c               |   8 +++
 net/ipv4/ip_output.c            |   4 ++
 net/ipv4/netfilter/ip_tables.c  | 127 +++++++++++++++++++++++++++++++++++----
 net/ipv6/ip6_output.c           |   4 ++
 net/ipv6/netfilter/ip6_tables.c | 128 ++++++++++++++++++++++++++++++++++++----
 net/netfilter/Kconfig           |  12 ++++
 net/netfilter/Makefile          |   1 +
 net/netfilter/xt_TRACE.c        |  53 +++++++++++++++++
 9 files changed, 314 insertions(+), 27 deletions(-)
 create mode 100644 net/netfilter/xt_TRACE.c

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2d6a14f5f2f1..625d73b07ab7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -227,6 +227,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@mark: Generic packet mark
  *	@nfct: Associated connection, if any
  *	@ipvs_property: skbuff is owned by ipvs
+ *	@nf_trace: netfilter packet trace flag
  *	@nfctinfo: Relationship of this skb to the connection
  *	@nfct_reasm: netfilter conntrack re-assembly pointer
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
@@ -278,7 +279,8 @@ struct sk_buff {
 				nfctinfo:3;
 	__u8			pkt_type:3,
 				fclone:2,
-				ipvs_property:1;
+				ipvs_property:1,
+				nf_trace:1;
 	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6a41b96b3d37..0583e8498f13 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -428,6 +428,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	n->destructor = NULL;
 	C(mark);
 	__nf_copy(n, skb);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+	C(nf_trace);
+#endif
 #ifdef CONFIG_NET_SCHED
 	C(tc_index);
 #ifdef CONFIG_NET_CLS_ACT
@@ -485,6 +489,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->destructor = NULL;
 	new->mark	= old->mark;
 	__nf_copy(new, old);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+	new->nf_trace	= old->nf_trace;
+#endif
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property = old->ipvs_property;
 #endif
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a7dd343d3a03..c9e2b5e6305e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -399,6 +399,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+	to->nf_trace = from->nf_trace;
+#endif
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	to->ipvs_property = from->ipvs_property;
 #endif
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7962306df585..650ab52e9157 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -204,6 +204,112 @@ get_entry(void *base, unsigned int offset)
 	return (struct ipt_entry *)(base + offset);
 }
 
+/* All zeroes == unconditional rule. */
+static inline int
+unconditional(const struct ipt_ip *ip)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
+		if (((__u32 *)ip)[i])
+			return 0;
+
+	return 1;
+}
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+static const char *hooknames[] = {
+	[NF_IP_PRE_ROUTING]		= "PREROUTING",
+	[NF_IP_LOCAL_IN]		= "INPUT",
+	[NF_IP_FORWARD]			= "FORWARD",
+	[NF_IP_LOCAL_OUT]		= "OUTPUT",
+	[NF_IP_POST_ROUTING]		= "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {
+	NF_IP_TRACE_COMMENT_RULE,
+	NF_IP_TRACE_COMMENT_RETURN,
+	NF_IP_TRACE_COMMENT_POLICY,
+};
+
+static const char *comments[] = {
+	[NF_IP_TRACE_COMMENT_RULE]	= "rule",
+	[NF_IP_TRACE_COMMENT_RETURN]	= "return",
+	[NF_IP_TRACE_COMMENT_POLICY]	= "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+	.type = NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level = 4,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+static inline int
+get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
+		      char *hookname, char **chainname,
+		      char **comment, unsigned int *rulenum)
+{
+	struct ipt_standard_target *t = (void *)ipt_get_target(s);
+
+	if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
+		/* Head of user chain: ERROR target with chainname */
+		*chainname = t->target.data;
+		(*rulenum) = 0;
+	} else if (s == e) {
+		(*rulenum)++;
+
+		if (s->target_offset == sizeof(struct ipt_entry)
+		   && strcmp(t->target.u.kernel.target->name,
+			     IPT_STANDARD_TARGET) == 0
+		   && t->verdict < 0
+		   && unconditional(&s->ip)) {
+			/* Tail of chains: STANDARD target (return/policy) */
+			*comment = *chainname == hookname
+				? (char *)comments[NF_IP_TRACE_COMMENT_POLICY]
+				: (char *)comments[NF_IP_TRACE_COMMENT_RETURN];
+		}
+		return 1;
+	} else
+		(*rulenum)++;
+
+	return 0;
+}
+
+static void trace_packet(struct sk_buff *skb,
+			 unsigned int hook,
+			 const struct net_device *in,
+			 const struct net_device *out,
+			 char *tablename,
+			 struct xt_table_info *private,
+			 struct ipt_entry *e)
+{
+	void *table_base;
+	struct ipt_entry *root;
+	char *hookname, *chainname, *comment;
+	unsigned int rulenum = 0;
+
+	table_base = (void *)private->entries[smp_processor_id()];
+	root = get_entry(table_base, private->hook_entry[hook]);
+
+	hookname = chainname = (char *)hooknames[hook];
+	comment = (char *)comments[NF_IP_TRACE_COMMENT_RULE];
+
+	IPT_ENTRY_ITERATE(root,
+			  private->size - private->hook_entry[hook],
+			  get_chainname_rulenum,
+			  e, hookname, &chainname, &comment, &rulenum);
+
+	nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
+		      "TRACE: %s:%s:%s:%u ",
+		      tablename, chainname, comment, rulenum);
+}
+#endif
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff **pskb,
@@ -261,6 +367,14 @@ ipt_do_table(struct sk_buff **pskb,
 
 			t = ipt_get_target(e);
 			IP_NF_ASSERT(t->u.kernel.target);
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+			/* The packet is traced: log it */
+			if (unlikely((*pskb)->nf_trace))
+				trace_packet(*pskb, hook, in, out,
+					     table->name, private, e);
+#endif
 			/* Standard target? */
 			if (!t->u.kernel.target->target) {
 				int v;
@@ -341,19 +455,6 @@ ipt_do_table(struct sk_buff **pskb,
 #endif
 }
 
-/* All zeroes == unconditional rule. */
-static inline int
-unconditional(const struct ipt_ip *ip)
-{
-	unsigned int i;
-
-	for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
-		if (((__u32 *)ip)[i])
-			return 0;
-
-	return 1;
-}
-
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 31dafaf0b652..50d86e94d9ed 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -521,6 +521,10 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+	to->nf_trace = from->nf_trace;
+#endif
 	skb_copy_secmark(to, from);
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7fe4d29708cb..4f93b79163aa 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -241,6 +241,113 @@ get_entry(void *base, unsigned int offset)
 	return (struct ip6t_entry *)(base + offset);
 }
 
+/* All zeroes == unconditional rule. */
+static inline int
+unconditional(const struct ip6t_ip6 *ipv6)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(*ipv6); i++)
+		if (((char *)ipv6)[i])
+			break;
+
+	return (i == sizeof(*ipv6));
+}
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+/* This cries for unification! */
+static const char *hooknames[] = {
+	[NF_IP6_PRE_ROUTING]		= "PREROUTING",
+	[NF_IP6_LOCAL_IN]		= "INPUT",
+	[NF_IP6_FORWARD]		= "FORWARD",
+	[NF_IP6_LOCAL_OUT]		= "OUTPUT",
+	[NF_IP6_POST_ROUTING]		= "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {
+	NF_IP6_TRACE_COMMENT_RULE,
+	NF_IP6_TRACE_COMMENT_RETURN,
+	NF_IP6_TRACE_COMMENT_POLICY,
+};
+
+static const char *comments[] = {
+	[NF_IP6_TRACE_COMMENT_RULE]	= "rule",
+	[NF_IP6_TRACE_COMMENT_RETURN]	= "return",
+	[NF_IP6_TRACE_COMMENT_POLICY]	= "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+	.type = NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level = 4,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+static inline int
+get_chainname_rulenum(struct ip6t_entry *s, struct ip6t_entry *e,
+		      char *hookname, char **chainname,
+		      char **comment, unsigned int *rulenum)
+{
+	struct ip6t_standard_target *t = (void *)ip6t_get_target(s);
+
+	if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) {
+		/* Head of user chain: ERROR target with chainname */
+		*chainname = t->target.data;
+		(*rulenum) = 0;
+	} else if (s == e) {
+		(*rulenum)++;
+
+		if (s->target_offset == sizeof(struct ip6t_entry)
+		   && strcmp(t->target.u.kernel.target->name,
+			     IP6T_STANDARD_TARGET) == 0
+		   && t->verdict < 0
+		   && unconditional(&s->ipv6)) {
+			/* Tail of chains: STANDARD target (return/policy) */
+			*comment = *chainname == hookname
+				? (char *)comments[NF_IP6_TRACE_COMMENT_POLICY]
+				: (char *)comments[NF_IP6_TRACE_COMMENT_RETURN];
+		}
+		return 1;
+	} else
+		(*rulenum)++;
+
+	return 0;
+}
+
+static void trace_packet(struct sk_buff *skb,
+			 unsigned int hook,
+			 const struct net_device *in,
+			 const struct net_device *out,
+			 char *tablename,
+			 struct xt_table_info *private,
+			 struct ip6t_entry *e)
+{
+	void *table_base;
+	struct ip6t_entry *root;
+	char *hookname, *chainname, *comment;
+	unsigned int rulenum = 0;
+
+	table_base = (void *)private->entries[smp_processor_id()];
+	root = get_entry(table_base, private->hook_entry[hook]);
+
+	hookname = chainname = (char *)hooknames[hook];
+	comment = (char *)comments[NF_IP6_TRACE_COMMENT_RULE];
+
+	IP6T_ENTRY_ITERATE(root,
+			   private->size - private->hook_entry[hook],
+			   get_chainname_rulenum,
+			   e, hookname, &chainname, &comment, &rulenum);
+
+	nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo,
+		      "TRACE: %s:%s:%s:%u ",
+		      tablename, chainname, comment, rulenum);
+}
+#endif
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff **pskb,
@@ -298,6 +405,14 @@ ip6t_do_table(struct sk_buff **pskb,
 
 			t = ip6t_get_target(e);
 			IP_NF_ASSERT(t->u.kernel.target);
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+			/* The packet is traced: log it */
+			if (unlikely((*pskb)->nf_trace))
+				trace_packet(*pskb, hook, in, out,
+					     table->name, private, e);
+#endif
 			/* Standard target? */
 			if (!t->u.kernel.target->target) {
 				int v;
@@ -377,19 +492,6 @@ ip6t_do_table(struct sk_buff **pskb,
 #endif
 }
 
-/* All zeroes == unconditional rule. */
-static inline int
-unconditional(const struct ip6t_ip6 *ipv6)
-{
-	unsigned int i;
-
-	for (i = 0; i < sizeof(*ipv6); i++)
-		if (((char *)ipv6)[i])
-			break;
-
-	return (i == sizeof(*ipv6));
-}
-
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index aa567faa2a88..df5e8dab871d 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -343,6 +343,18 @@ config NETFILTER_XT_TARGET_NOTRACK
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_TARGET_TRACE
+	tristate  '"TRACE" target support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_RAW || IP6_NF_RAW
+	help
+	  The TRACE target allows you to mark packets so that the kernel
+	  will log every rule which match the packets as those traverse
+	  the tables, chains, rules.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
 config NETFILTER_XT_TARGET_SECMARK
 	tristate '"SECMARK" target support'
 	depends on NETFILTER_XTABLES && NETWORK_SECMARK
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3cf5b9cd6fe1..3b792687f001 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
new file mode 100644
index 000000000000..b82fc4680344
--- /dev/null
+++ b/net/netfilter/xt_TRACE.c
@@ -0,0 +1,53 @@
+/* This is a module which is used to mark packets for tracing.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TRACE");
+MODULE_ALIAS("ip6t_TRACE");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const struct xt_target *target,
+       const void *targinfo)
+{
+	(*pskb)->nf_trace = 1;
+	return XT_CONTINUE;
+}
+
+static struct xt_target xt_trace_target[] = {
+	{
+		.name		= "TRACE",
+		.family		= AF_INET,
+		.target		= target,
+		.table		= "raw",
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "TRACE",
+		.family		= AF_INET6,
+		.target		= target,
+		.table		= "raw",
+		.me		= THIS_MODULE,
+	},
+};
+
+static int __init xt_trace_init(void)
+{
+	return xt_register_targets(xt_trace_target,
+				   ARRAY_SIZE(xt_trace_target));
+}
+
+static void __exit xt_trace_fini(void)
+{
+	xt_unregister_targets(xt_trace_target, ARRAY_SIZE(xt_trace_target));
+}
+
+module_init(xt_trace_init);
+module_exit(xt_trace_fini);
-- 
cgit v1.3-8-gc7d7


From d3c3f4243e135b3d8c41d98be0cb2f54a4141abf Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 7 Jul 2007 22:38:30 -0700
Subject: [NETFILTER]: ipt_CLUSTERIP: add compat code

Adjust structure size and don't expect pointers passed in from
userspace to be valid. Also replace an enum in an ABI structure
by a fixed size type.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ipt_CLUSTERIP.h |  4 +--
 net/ipv4/netfilter/ipt_CLUSTERIP.c           | 39 ++++++++++++++--------------
 2 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
index d9bceedfb3dc..daf50be22c9d 100644
--- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
+++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
@@ -18,13 +18,13 @@ struct clusterip_config;
 struct ipt_clusterip_tgt_info {
 
 	u_int32_t flags;
-	
+
 	/* only relevant for new ones */
 	u_int8_t clustermac[6];
 	u_int16_t num_total_nodes;
 	u_int16_t num_local_nodes;
 	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
-	enum clusterip_hashmode hash_mode;
+	u_int32_t hash_mode;
 	u_int32_t hash_initval;
 
 	struct clusterip_config *config;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1cef3b09c326..1981acedbfe8 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -397,23 +397,7 @@ checkentry(const char *tablename,
 	/* FIXME: further sanity checks */
 
 	config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
-	if (config) {
-		if (cipinfo->config != NULL) {
-			/* Case A: This is an entry that gets reloaded, since
-			 * it still has a cipinfo->config pointer. Simply
-			 * increase the entry refcount and return */
-			if (cipinfo->config != config) {
-				printk(KERN_ERR "CLUSTERIP: Reloaded entry "
-				       "has invalid config pointer!\n");
-				return false;
-			}
-		} else {
-			/* Case B: This is a new rule referring to an existing
-			 * clusterip config. */
-			cipinfo->config = config;
-		}
-	} else {
-		/* Case C: This is a completely new clusterip config */
+	if (!config) {
 		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
 			printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
 			return false;
@@ -440,8 +424,8 @@ checkentry(const char *tablename,
 			}
 			dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
 		}
-		cipinfo->config = config;
 	}
+	cipinfo->config = config;
 
 	if (nf_ct_l3proto_try_module_get(target->family) < 0) {
 		printk(KERN_WARNING "can't load conntrack support for "
@@ -466,13 +450,30 @@ static void destroy(const struct xt_target *target, void *targinfo)
 	nf_ct_l3proto_module_put(target->family);
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_ipt_clusterip_tgt_info
+{
+	u_int32_t	flags;
+	u_int8_t	clustermac[6];
+	u_int16_t	num_total_nodes;
+	u_int16_t	num_local_nodes;
+	u_int16_t	local_nodes[CLUSTERIP_MAX_NODES];
+	u_int32_t	hash_mode;
+	u_int32_t	hash_initval;
+	compat_uptr_t	config;
+};
+#endif /* CONFIG_COMPAT */
+
 static struct xt_target clusterip_tgt __read_mostly = {
 	.name		= "CLUSTERIP",
 	.family		= AF_INET,
 	.target		= target,
-	.targetsize	= sizeof(struct ipt_clusterip_tgt_info),
 	.checkentry	= checkentry,
 	.destroy	= destroy,
+	.targetsize	= sizeof(struct ipt_clusterip_tgt_info),
+#ifdef CONFIG_COMPAT
+	.compatsize	= sizeof(struct compat_ipt_clusterip_tgt_info),
+#endif /* CONFIG_COMPAT */
 	.me		= THIS_MODULE
 };
 
-- 
cgit v1.3-8-gc7d7


From 0d53778e81ac7af266dac8a20cc328328c327112 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 7 Jul 2007 22:39:38 -0700
Subject: [NETFILTER]: Convert DEBUGP to pr_debug

Convert DEBUGP to pr_debug and fix lots of non-compiling debug statements.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nf_conntrack_pptp.h        |   2 +
 include/net/netfilter/nf_conntrack_tuple.h         |  10 +-
 net/ipv4/netfilter/ipt_CLUSTERIP.c                 |  27 ++--
 net/ipv4/netfilter/ipt_LOG.c                       |  12 +-
 net/ipv4/netfilter/ipt_MASQUERADE.c                |  10 +-
 net/ipv4/netfilter/ipt_NETMAP.c                    |  13 +-
 net/ipv4/netfilter/ipt_REDIRECT.c                  |  10 +-
 net/ipv4/netfilter/ipt_REJECT.c                    |  10 +-
 net/ipv4/netfilter/ipt_SAME.c                      |  53 +++----
 net/ipv4/netfilter/ipt_ULOG.c                      |  35 ++---
 net/ipv4/netfilter/ipt_iprange.c                   |  30 ++--
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c     |  22 +--
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c   |   6 -
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c       |  22 +--
 net/ipv4/netfilter/nf_nat_core.c                   |  15 +-
 net/ipv4/netfilter/nf_nat_ftp.c                    |  14 +-
 net/ipv4/netfilter/nf_nat_h323.c                   |  95 ++++++------
 net/ipv4/netfilter/nf_nat_helper.c                 |  40 +++--
 net/ipv4/netfilter/nf_nat_irc.c                    |  13 +-
 net/ipv4/netfilter/nf_nat_pptp.c                   |  37 ++---
 net/ipv4/netfilter/nf_nat_proto_gre.c              |  17 +--
 net/ipv4/netfilter/nf_nat_rule.c                   |  14 +-
 net/ipv4/netfilter/nf_nat_sip.c                    |   8 -
 net/ipv4/netfilter/nf_nat_standalone.c             |  16 +-
 net/ipv6/netfilter/ip6t_LOG.c                      |  12 +-
 net/ipv6/netfilter/ip6t_REJECT.c                   |  23 ++-
 net/ipv6/netfilter/ip6t_ah.c                       |  47 +++---
 net/ipv6/netfilter/ip6t_frag.c                     |  76 +++++-----
 net/ipv6/netfilter/ip6t_hbh.c                      |  45 +++---
 net/ipv6/netfilter/ip6t_rt.c                       |  82 +++++------
 net/ipv6/netfilter/ip6table_mangle.c               |   6 -
 net/ipv6/netfilter/ip6table_raw.c                  |   6 -
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c     |  10 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c     |  24 ++-
 net/ipv6/netfilter/nf_conntrack_reasm.c            |  52 +++----
 net/netfilter/nf_conntrack_core.c                  |  41 +++---
 net/netfilter/nf_conntrack_ftp.c                   |  94 ++++++------
 net/netfilter/nf_conntrack_h323_main.c             | 161 ++++++++++-----------
 net/netfilter/nf_conntrack_irc.c                   |  24 ++-
 net/netfilter/nf_conntrack_l3proto_generic.c       |   6 -
 net/netfilter/nf_conntrack_pptp.c                  |  71 +++++----
 net/netfilter/nf_conntrack_proto_gre.c             |  28 ++--
 net/netfilter/nf_conntrack_proto_sctp.c            |  92 ++++--------
 net/netfilter/nf_conntrack_proto_tcp.c             | 129 ++++++++---------
 net/netfilter/nf_conntrack_sane.c                  |  27 ++--
 net/netfilter/nf_conntrack_sip.c                   |  22 +--
 net/netfilter/nf_conntrack_standalone.c            |   6 -
 net/netfilter/nf_conntrack_tftp.c                  |  17 +--
 48 files changed, 662 insertions(+), 970 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_pptp.h b/include/linux/netfilter/nf_conntrack_pptp.h
index 9d8144a488cd..c93061f33144 100644
--- a/include/linux/netfilter/nf_conntrack_pptp.h
+++ b/include/linux/netfilter/nf_conntrack_pptp.h
@@ -4,6 +4,8 @@
 
 #include <linux/netfilter/nf_conntrack_common.h>
 
+extern const char *pptp_msg_name[];
+
 /* state of the control session */
 enum pptp_ctrlsess_state {
 	PPTP_SESSION_NONE,			/* no session present */
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index 99934ab538e6..040dae5f0c9e 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -120,11 +120,11 @@ struct nf_conntrack_tuple_mask
 
 #ifdef __KERNEL__
 
-#define NF_CT_DUMP_TUPLE(tp)						    \
-DEBUGP("tuple %p: %u %u " NIP6_FMT " %hu -> " NIP6_FMT " %hu\n",	    \
-	(tp), (tp)->src.l3num, (tp)->dst.protonum,			    \
-	NIP6(*(struct in6_addr *)(tp)->src.u3.all), ntohs((tp)->src.u.all), \
-	NIP6(*(struct in6_addr *)(tp)->dst.u3.all), ntohs((tp)->dst.u.all))
+#define NF_CT_DUMP_TUPLE(tp)						     \
+pr_debug("tuple %p: %u %u " NIP6_FMT " %hu -> " NIP6_FMT " %hu\n",	     \
+	 (tp), (tp)->src.l3num, (tp)->dst.protonum,			     \
+	 NIP6(*(struct in6_addr *)(tp)->src.u3.all), ntohs((tp)->src.u.all), \
+	 NIP6(*(struct in6_addr *)(tp)->dst.u3.all), ntohs((tp)->dst.u.all))
 
 /* If we're the first tuple, it's the original dir. */
 #define NF_CT_DIRECTION(h)						\
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1981acedbfe8..8bacda3f6f6c 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -30,14 +30,6 @@
 
 #define CLUSTERIP_VERSION "0.8"
 
-#define DEBUG_CLUSTERIP
-
-#ifdef DEBUG_CLUSTERIP
-#define DEBUGP	printk
-#else
-#define DEBUGP
-#endif
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -351,15 +343,15 @@ target(struct sk_buff **pskb,
 			break;
 	}
 
-#ifdef DEBUG_CLUSTERP
+#ifdef DEBUG
 	DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 #endif
-	DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
+	pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
 	if (!clusterip_responsible(cipinfo->config, hash)) {
-		DEBUGP("not responsible\n");
+		pr_debug("not responsible\n");
 		return NF_DROP;
 	}
-	DEBUGP("responsible\n");
+	pr_debug("responsible\n");
 
 	/* despite being received via linklayer multicast, this is
 	 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
@@ -490,7 +482,7 @@ struct arp_payload {
 	__be32 dst_ip;
 } __attribute__ ((packed));
 
-#ifdef CLUSTERIP_DEBUG
+#ifdef DEBUG
 static void arp_print(struct arp_payload *payload)
 {
 #define HBUFFERLEN 30
@@ -546,8 +538,9 @@ arp_mangle(unsigned int hook,
 	 * this wouldn't work, since we didn't subscribe the mcast group on
 	 * other interfaces */
 	if (c->dev != out) {
-		DEBUGP("CLUSTERIP: not mangling arp reply on different "
-		       "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
+		pr_debug("CLUSTERIP: not mangling arp reply on different "
+			 "interface: cip'%s'-skb'%s'\n",
+			 c->dev->name, out->name);
 		clusterip_config_put(c);
 		return NF_ACCEPT;
 	}
@@ -555,8 +548,8 @@ arp_mangle(unsigned int hook,
 	/* mangle reply hardware address */
 	memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
 
-#ifdef CLUSTERIP_DEBUG
-	DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
+#ifdef DEBUG
+	pr_debug(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
 	arp_print(payload);
 #endif
 
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 9bfce614ec28..5937ad150b9f 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,12 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("iptables syslog logging module");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Use lock to serialize, so printks don't overlap */
 static DEFINE_SPINLOCK(log_lock);
 
@@ -452,12 +446,12 @@ static bool ipt_log_checkentry(const char *tablename,
 	const struct ipt_log_info *loginfo = targinfo;
 
 	if (loginfo->level >= 8) {
-		DEBUGP("LOG: level %u >= 8\n", loginfo->level);
+		pr_debug("LOG: level %u >= 8\n", loginfo->level);
 		return false;
 	}
 	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
-		DEBUGP("LOG: prefix term %i\n",
-		       loginfo->prefix[sizeof(loginfo->prefix)-1]);
+		pr_debug("LOG: prefix term %i\n",
+			 loginfo->prefix[sizeof(loginfo->prefix)-1]);
 		return false;
 	}
 	return true;
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index bc033e0f424d..7c4e4be7c8b3 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -27,12 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("iptables MASQUERADE target module");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Lock protects masq region inside conntrack */
 static DEFINE_RWLOCK(masq_lock);
 
@@ -47,11 +41,11 @@ masquerade_check(const char *tablename,
 	const struct nf_nat_multi_range_compat *mr = targinfo;
 
 	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
-		DEBUGP("masquerade_check: bad MAP_IPS.\n");
+		pr_debug("masquerade_check: bad MAP_IPS.\n");
 		return false;
 	}
 	if (mr->rangesize != 1) {
-		DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
+		pr_debug("masquerade_check: bad rangesize %u\n", mr->rangesize);
 		return false;
 	}
 	return true;
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 0a7ce15bbdd0..41a011d5a065 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -18,17 +18,10 @@
 #include <linux/netfilter/x_tables.h>
 #include <net/netfilter/nf_nat_rule.h>
 
-#define MODULENAME "NETMAP"
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
 MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static bool
 check(const char *tablename,
       const void *e,
@@ -39,11 +32,11 @@ check(const char *tablename,
 	const struct nf_nat_multi_range_compat *mr = targinfo;
 
 	if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
-		DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
+		pr_debug("NETMAP:check: bad MAP_IPS.\n");
 		return false;
 	}
 	if (mr->rangesize != 1) {
-		DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
+		pr_debug("NETMAP:check: bad rangesize %u.\n", mr->rangesize);
 		return false;
 	}
 	return true;
@@ -86,7 +79,7 @@ target(struct sk_buff **pskb,
 }
 
 static struct xt_target target_module __read_mostly = {
-	.name 		= MODULENAME,
+	.name 		= "NETMAP",
 	.family		= AF_INET,
 	.target 	= target,
 	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 61e1e4772e37..6ac7a2373316 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -25,12 +25,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("iptables REDIRECT target module");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* FIXME: Take multiple ranges --RR */
 static bool
 redirect_check(const char *tablename,
@@ -42,11 +36,11 @@ redirect_check(const char *tablename,
 	const struct nf_nat_multi_range_compat *mr = targinfo;
 
 	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
-		DEBUGP("redirect_check: bad MAP_IPS.\n");
+		pr_debug("redirect_check: bad MAP_IPS.\n");
 		return false;
 	}
 	if (mr->rangesize != 1) {
-		DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
+		pr_debug("redirect_check: bad rangesize %u.\n", mr->rangesize);
 		return false;
 	}
 	return true;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index dd5432c3f365..cb038c8fbc9d 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -31,12 +31,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("iptables REJECT target module");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Send RST reply */
 static void send_reset(struct sk_buff *oldskb, int hook)
 {
@@ -227,13 +221,13 @@ static bool check(const char *tablename,
 	const struct ipt_entry *e = e_void;
 
 	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
-		printk("REJECT: ECHOREPLY no longer supported.\n");
+		printk("ipt_REJECT: ECHOREPLY no longer supported.\n");
 		return false;
 	} else if (rejinfo->with == IPT_TCP_RESET) {
 		/* Must specify that it's a TCP packet */
 		if (e->ip.proto != IPPROTO_TCP
 		    || (e->ip.invflags & XT_INV_PROTO)) {
-			DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
+			printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
 			return false;
 		}
 	}
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
index 3a0d7dac0af8..97641f1a97f6 100644
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -27,12 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
 MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static bool
 same_check(const char *tablename,
 	      const void *e,
@@ -46,54 +40,52 @@ same_check(const char *tablename,
 	mr->ipnum = 0;
 
 	if (mr->rangesize < 1) {
-		DEBUGP("same_check: need at least one dest range.\n");
+		pr_debug("same_check: need at least one dest range.\n");
 		return false;
 	}
 	if (mr->rangesize > IPT_SAME_MAX_RANGE) {
-		DEBUGP("same_check: too many ranges specified, maximum "
-				"is %u ranges\n",
-				IPT_SAME_MAX_RANGE);
+		pr_debug("same_check: too many ranges specified, maximum "
+			 "is %u ranges\n", IPT_SAME_MAX_RANGE);
 		return false;
 	}
 	for (count = 0; count < mr->rangesize; count++) {
 		if (ntohl(mr->range[count].min_ip) >
 				ntohl(mr->range[count].max_ip)) {
-			DEBUGP("same_check: min_ip is larger than max_ip in "
-				"range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
-				NIPQUAD(mr->range[count].min_ip),
-				NIPQUAD(mr->range[count].max_ip));
+			pr_debug("same_check: min_ip is larger than max_ip in "
+				 "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
+				 NIPQUAD(mr->range[count].min_ip),
+				 NIPQUAD(mr->range[count].max_ip));
 			return false;
 		}
 		if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
-			DEBUGP("same_check: bad MAP_IPS.\n");
+			pr_debug("same_check: bad MAP_IPS.\n");
 			return false;
 		}
 		rangeip = (ntohl(mr->range[count].max_ip) -
 					ntohl(mr->range[count].min_ip) + 1);
 		mr->ipnum += rangeip;
 
-		DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip);
+		pr_debug("same_check: range %u, ipnum = %u\n", count, rangeip);
 	}
-	DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum);
+	pr_debug("same_check: total ipaddresses = %u\n", mr->ipnum);
 
 	mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
 	if (!mr->iparray) {
-		DEBUGP("same_check: Couldn't allocate %u bytes "
-			"for %u ipaddresses!\n",
-			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+		pr_debug("same_check: Couldn't allocate %Zu bytes "
+			 "for %u ipaddresses!\n",
+			 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
 		return false;
 	}
-	DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n",
-			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+	pr_debug("same_check: Allocated %Zu bytes for %u ipaddresses.\n",
+		 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
 
 	for (count = 0; count < mr->rangesize; count++) {
 		for (countess = ntohl(mr->range[count].min_ip);
 				countess <= ntohl(mr->range[count].max_ip);
 					countess++) {
 			mr->iparray[index] = countess;
-			DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' "
-				"in index %u.\n",
-				HIPQUAD(countess), index);
+			pr_debug("same_check: Added ipaddress `%u.%u.%u.%u' "
+				 "in index %u.\n", HIPQUAD(countess), index);
 			index++;
 		}
 	}
@@ -107,8 +99,8 @@ same_destroy(const struct xt_target *target, void *targinfo)
 
 	kfree(mr->iparray);
 
-	DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n",
-			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+	pr_debug("same_destroy: Deallocated %Zu bytes for %u ipaddresses.\n",
+		 (sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
 }
 
 static unsigned int
@@ -146,10 +138,9 @@ same_target(struct sk_buff **pskb,
 
 	new_ip = htonl(same->iparray[aindex]);
 
-	DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
-			"new src=%u.%u.%u.%u\n",
-			NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip),
-			NIPQUAD(new_ip));
+	pr_debug("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
+		 "new src=%u.%u.%u.%u\n",
+		 NIPQUAD(t->src.u3.ip), NIPQUAD(t->dst.u3.ip), NIPQUAD(new_ip));
 
 	/* Transfer from original range. */
 	newrange = ((struct nf_nat_range)
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 226750d1f89b..6ca43e4ca7e3 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -55,13 +55,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
 #define ULOG_NL_EVENT		111		/* Harald's favorite number */
 #define ULOG_MAXNLGROUPS	32		/* numer of nlgroups */
 
-#if 0
-#define DEBUGP(format, args...) printk("%s:%s:" format, \
-				       __FILE__, __FUNCTION__ , ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
 #define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
 
 static unsigned int nlbufsiz = NLMSG_GOODSIZE;
@@ -96,12 +89,12 @@ static void ulog_send(unsigned int nlgroupnum)
 	ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
 
 	if (timer_pending(&ub->timer)) {
-		DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n");
+		pr_debug("ipt_ULOG: ulog_send: timer was pending, deleting\n");
 		del_timer(&ub->timer);
 	}
 
 	if (!ub->skb) {
-		DEBUGP("ipt_ULOG: ulog_send: nothing to send\n");
+		pr_debug("ipt_ULOG: ulog_send: nothing to send\n");
 		return;
 	}
 
@@ -110,8 +103,8 @@ static void ulog_send(unsigned int nlgroupnum)
 		ub->lastnlh->nlmsg_type = NLMSG_DONE;
 
 	NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
-	DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
-		ub->qlen, nlgroupnum + 1);
+	pr_debug("ipt_ULOG: throwing %d packets to netlink group %u\n",
+		 ub->qlen, nlgroupnum + 1);
 	netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
 
 	ub->qlen = 0;
@@ -123,7 +116,7 @@ static void ulog_send(unsigned int nlgroupnum)
 /* timer function to flush queue in flushtimeout time */
 static void ulog_timer(unsigned long data)
 {
-	DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n");
+	pr_debug("ipt_ULOG: timer function called, calling ulog_send\n");
 
 	/* lock to protect against somebody modifying our structure
 	 * from ipt_ulog_target at the same time */
@@ -204,8 +197,8 @@ static void ipt_ulog_packet(unsigned int hooknum,
 			goto alloc_failure;
 	}
 
-	DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen,
-		loginfo->qthreshold);
+	pr_debug("ipt_ULOG: qlen %d, qthreshold %Zu\n", ub->qlen,
+		 loginfo->qthreshold);
 
 	/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
 	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
@@ -334,13 +327,13 @@ static bool ipt_ulog_checkentry(const char *tablename,
 	const struct ipt_ulog_info *loginfo = targinfo;
 
 	if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
-		DEBUGP("ipt_ULOG: prefix term %i\n",
-		       loginfo->prefix[sizeof(loginfo->prefix) - 1]);
+		pr_debug("ipt_ULOG: prefix term %i\n",
+			 loginfo->prefix[sizeof(loginfo->prefix) - 1]);
 		return false;
 	}
 	if (loginfo->qthreshold > ULOG_MAX_QLEN) {
-		DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
-			loginfo->qthreshold);
+		pr_debug("ipt_ULOG: queue threshold %Zu > MAX_QLEN\n",
+			 loginfo->qthreshold);
 		return false;
 	}
 	return true;
@@ -405,7 +398,7 @@ static int __init ipt_ulog_init(void)
 {
 	int ret, i;
 
-	DEBUGP("ipt_ULOG: init module\n");
+	pr_debug("ipt_ULOG: init module\n");
 
 	if (nlbufsiz > 128*1024) {
 		printk("Netlink buffer has to be <= 128kB\n");
@@ -437,7 +430,7 @@ static void __exit ipt_ulog_fini(void)
 	ulog_buff_t *ub;
 	int i;
 
-	DEBUGP("ipt_ULOG: cleanup_module\n");
+	pr_debug("ipt_ULOG: cleanup_module\n");
 
 	if (nflog)
 		nf_log_unregister(&ipt_ulog_logger);
@@ -448,7 +441,7 @@ static void __exit ipt_ulog_fini(void)
 	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
 		ub = &ulog_buffers[i];
 		if (timer_pending(&ub->timer)) {
-			DEBUGP("timer was pending, deleting\n");
+			pr_debug("timer was pending, deleting\n");
 			del_timer(&ub->timer);
 		}
 
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
index 6a3a033a6808..0106dc955a69 100644
--- a/net/ipv4/netfilter/ipt_iprange.c
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -17,12 +17,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
 MODULE_DESCRIPTION("iptables arbitrary IP range match module");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static bool
 match(const struct sk_buff *skb,
       const struct net_device *in,
@@ -38,12 +32,12 @@ match(const struct sk_buff *skb,
 		if ((ntohl(iph->saddr) < ntohl(info->src.min_ip)
 			  || ntohl(iph->saddr) > ntohl(info->src.max_ip))
 			 ^ !!(info->flags & IPRANGE_SRC_INV)) {
-			DEBUGP("src IP %u.%u.%u.%u NOT in range %s"
-			       "%u.%u.%u.%u-%u.%u.%u.%u\n",
-				NIPQUAD(iph->saddr),
-				info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
-				NIPQUAD(info->src.min_ip),
-				NIPQUAD(info->src.max_ip));
+			pr_debug("src IP %u.%u.%u.%u NOT in range %s"
+				 "%u.%u.%u.%u-%u.%u.%u.%u\n",
+				 NIPQUAD(iph->saddr),
+				 info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
+				 NIPQUAD(info->src.min_ip),
+				 NIPQUAD(info->src.max_ip));
 			return false;
 		}
 	}
@@ -51,12 +45,12 @@ match(const struct sk_buff *skb,
 		if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip)
 			  || ntohl(iph->daddr) > ntohl(info->dst.max_ip))
 			 ^ !!(info->flags & IPRANGE_DST_INV)) {
-			DEBUGP("dst IP %u.%u.%u.%u NOT in range %s"
-			       "%u.%u.%u.%u-%u.%u.%u.%u\n",
-				NIPQUAD(iph->daddr),
-				info->flags & IPRANGE_DST_INV ? "(INV) " : "",
-				NIPQUAD(info->dst.min_ip),
-				NIPQUAD(info->dst.max_ip));
+			pr_debug("dst IP %u.%u.%u.%u NOT in range %s"
+				 "%u.%u.%u.%u-%u.%u.%u.%u\n",
+				 NIPQUAD(iph->daddr),
+				 info->flags & IPRANGE_DST_INV ? "(INV) " : "",
+				 NIPQUAD(info->dst.min_ip),
+				 NIPQUAD(info->dst.max_ip));
 			return false;
 		}
 	}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index a103f597d446..3c5629938487 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -24,12 +24,6 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
 			     struct nf_conntrack_tuple *tuple)
 {
@@ -324,13 +318,13 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 
 	/* We only do TCP at the moment: is there a better way? */
 	if (strcmp(sk->sk_prot->name, "TCP")) {
-		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
+		pr_debug("SO_ORIGINAL_DST: Not a TCP socket\n");
 		return -ENOPROTOOPT;
 	}
 
 	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
-		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
-		       *len, sizeof(struct sockaddr_in));
+		pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+			 *len, sizeof(struct sockaddr_in));
 		return -EINVAL;
 	}
 
@@ -346,17 +340,17 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 			.tuple.dst.u3.ip;
 		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
 
-		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
-		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+		pr_debug("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+			 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
 		nf_ct_put(ct);
 		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
 			return -EFAULT;
 		else
 			return 0;
 	}
-	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
-	       NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port),
-	       NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port));
+	pr_debug("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
+		 NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port),
+		 NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port));
 	return -ENOENT;
 }
 
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index ab8e4c607b7a..434e08410879 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -18,12 +18,6 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 #ifdef CONFIG_NF_CT_ACCT
 static unsigned int
 seq_print_counters(struct seq_file *s,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 91fb277045ef..0fe8fb0466ef 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -21,12 +21,6 @@
 
 static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ;
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static int icmp_pkt_to_tuple(const struct sk_buff *skb,
 			     unsigned int dataoff,
 			     struct nf_conntrack_tuple *tuple)
@@ -125,8 +119,8 @@ static int icmp_new(struct nf_conn *conntrack,
 	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
 	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
 		/* Can't create a new ICMP `conn' with this. */
-		DEBUGP("icmp: can't create new conn with type %u\n",
-		       conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+		pr_debug("icmp: can't create new conn with type %u\n",
+			 conntrack->tuplehash[0].tuple.dst.u.icmp.type);
 		NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
 		return 0;
 	}
@@ -159,8 +153,8 @@ icmp_error_message(struct sk_buff *skb,
 
 	/* Ignore ICMP's containing fragments (shouldn't happen) */
 	if (inside->ip.frag_off & htons(IP_OFFSET)) {
-		DEBUGP("icmp_error_message: fragment of proto %u\n",
-		       inside->ip.protocol);
+		pr_debug("icmp_error_message: fragment of proto %u\n",
+			 inside->ip.protocol);
 		return -NF_ACCEPT;
 	}
 
@@ -172,8 +166,8 @@ icmp_error_message(struct sk_buff *skb,
 	if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET,
 			     inside->ip.protocol, &origtuple,
 			     &nf_conntrack_l3proto_ipv4, innerproto)) {
-		DEBUGP("icmp_error_message: ! get_tuple p=%u",
-		       inside->ip.protocol);
+		pr_debug("icmp_error_message: ! get_tuple p=%u",
+			 inside->ip.protocol);
 		return -NF_ACCEPT;
 	}
 
@@ -181,7 +175,7 @@ icmp_error_message(struct sk_buff *skb,
 	   been preserved inside the ICMP. */
 	if (!nf_ct_invert_tuple(&innertuple, &origtuple,
 				&nf_conntrack_l3proto_ipv4, innerproto)) {
-		DEBUGP("icmp_error_message: no match\n");
+		pr_debug("icmp_error_message: no match\n");
 		return -NF_ACCEPT;
 	}
 
@@ -196,7 +190,7 @@ icmp_error_message(struct sk_buff *skb,
 			h = nf_conntrack_find_get(&origtuple);
 
 		if (!h) {
-			DEBUGP("icmp_error_message: no match\n");
+			pr_debug("icmp_error_message: no match\n");
 			return -NF_ACCEPT;
 		}
 
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index f242ac61b3eb..e848d8d6292f 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -31,12 +31,6 @@
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static DEFINE_RWLOCK(nf_nat_lock);
 
 static struct nf_conntrack_l3proto *l3proto = NULL;
@@ -242,7 +236,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	   manips not an issue.  */
 	if (maniptype == IP_NAT_MANIP_SRC) {
 		if (find_appropriate_src(orig_tuple, tuple, range)) {
-			DEBUGP("get_unique_tuple: Found current src map\n");
+			pr_debug("get_unique_tuple: Found current src map\n");
 			if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
 				if (!nf_nat_used_tuple(tuple, ct))
 					return;
@@ -293,7 +287,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 	if (!nat) {
 		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
 		if (nat == NULL) {
-			DEBUGP("failed to add NAT extension\n");
+			pr_debug("failed to add NAT extension\n");
 			return NF_ACCEPT;
 		}
 	}
@@ -462,8 +456,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 			return 0;
 	}
 
-	DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
-	       *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+	pr_debug("icmp_reply_translation: translating error %p manip %u "
+		 "dir %s\n", *pskb, manip,
+		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
 
 	/* rcu_read_lock()ed by nf_hook_slow */
 	l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index cae4b460aee1..3663bd879c39 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -25,12 +25,6 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
 MODULE_DESCRIPTION("ftp NAT helper");
 MODULE_ALIAS("ip_nat_ftp");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* FIXME: Time out? --RR */
 
 static int
@@ -47,7 +41,7 @@ mangle_rfc959_packet(struct sk_buff **pskb,
 	sprintf(buffer, "%u,%u,%u,%u,%u,%u",
 		NIPQUAD(newip), port>>8, port&0xFF);
 
-	DEBUGP("calling nf_nat_mangle_tcp_packet\n");
+	pr_debug("calling nf_nat_mangle_tcp_packet\n");
 
 	return nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
 					matchlen, buffer, strlen(buffer));
@@ -67,7 +61,7 @@ mangle_eprt_packet(struct sk_buff **pskb,
 
 	sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
 
-	DEBUGP("calling nf_nat_mangle_tcp_packet\n");
+	pr_debug("calling nf_nat_mangle_tcp_packet\n");
 
 	return nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
 					matchlen, buffer, strlen(buffer));
@@ -87,7 +81,7 @@ mangle_epsv_packet(struct sk_buff **pskb,
 
 	sprintf(buffer, "|||%u|", port);
 
-	DEBUGP("calling nf_nat_mangle_tcp_packet\n");
+	pr_debug("calling nf_nat_mangle_tcp_packet\n");
 
 	return nf_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
 					matchlen, buffer, strlen(buffer));
@@ -117,7 +111,7 @@ static unsigned int nf_nat_ftp(struct sk_buff **pskb,
 	int dir = CTINFO2DIR(ctinfo);
 	struct nf_conn *ct = exp->master;
 
-	DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
 
 	/* Connection will come from wherever this packet goes, hence !dir */
 	newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 3d760dd657c7..c1b059a73708 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -21,12 +21,6 @@
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <linux/netfilter/nf_conntrack_h323.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /****************************************************************************/
 static int set_addr(struct sk_buff **pskb,
 		    unsigned char **data, int dataoff,
@@ -126,12 +120,11 @@ static int set_sig_addr(struct sk_buff **pskb, struct nf_conn *ct,
 				    (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
 					i = 0;
 
-				DEBUGP
-				    ("nf_nat_ras: set signal address "
-				     "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-				     NIPQUAD(ip), port,
-				     NIPQUAD(ct->tuplehash[!dir].tuple.dst.
-					     ip), info->sig_port[!dir]);
+				pr_debug("nf_nat_ras: set signal address "
+					 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+					 NIPQUAD(addr.ip), port,
+					 NIPQUAD(ct->tuplehash[!dir].tuple.dst.u3.ip),
+					 info->sig_port[!dir]);
 				return set_h225_addr(pskb, data, 0, &taddr[i],
 						     &ct->tuplehash[!dir].
 						     tuple.dst.u3,
@@ -139,12 +132,11 @@ static int set_sig_addr(struct sk_buff **pskb, struct nf_conn *ct,
 			} else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
 				   port == info->sig_port[dir]) {
 				/* GK->GW */
-				DEBUGP
-				    ("nf_nat_ras: set signal address "
-				     "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-				     NIPQUAD(ip), port,
-				     NIPQUAD(ct->tuplehash[!dir].tuple.src.
-					     ip), info->sig_port[!dir]);
+				pr_debug("nf_nat_ras: set signal address "
+					 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+					 NIPQUAD(addr.ip), port,
+					 NIPQUAD(ct->tuplehash[!dir].tuple.src.u3.ip),
+					 info->sig_port[!dir]);
 				return set_h225_addr(pskb, data, 0, &taddr[i],
 						     &ct->tuplehash[!dir].
 						     tuple.src.u3,
@@ -171,12 +163,11 @@ static int set_ras_addr(struct sk_buff **pskb, struct nf_conn *ct,
 		if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
 		    addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
 		    port == ct->tuplehash[dir].tuple.src.u.udp.port) {
-			DEBUGP("nf_nat_ras: set rasAddress "
-			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-			       NIPQUAD(ip), ntohs(port),
-			       NIPQUAD(ct->tuplehash[!dir].tuple.dst.u3.ip),
-			       ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.
-				     port));
+			pr_debug("nf_nat_ras: set rasAddress "
+				 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+				 NIPQUAD(addr.ip), ntohs(port),
+				 NIPQUAD(ct->tuplehash[!dir].tuple.dst.u3.ip),
+				 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
 			return set_h225_addr(pskb, data, 0, &taddr[i],
 					     &ct->tuplehash[!dir].tuple.dst.u3,
 					     ct->tuplehash[!dir].tuple.
@@ -267,16 +258,16 @@ static int nat_rtp_rtcp(struct sk_buff **pskb, struct nf_conn *ct,
 	}
 
 	/* Success */
-	DEBUGP("nf_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-	       NIPQUAD(rtp_exp->tuple.src.ip),
-	       ntohs(rtp_exp->tuple.src.u.udp.port),
-	       NIPQUAD(rtp_exp->tuple.dst.ip),
-	       ntohs(rtp_exp->tuple.dst.u.udp.port));
-	DEBUGP("nf_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-	       NIPQUAD(rtcp_exp->tuple.src.ip),
-	       ntohs(rtcp_exp->tuple.src.u.udp.port),
-	       NIPQUAD(rtcp_exp->tuple.dst.ip),
-	       ntohs(rtcp_exp->tuple.dst.u.udp.port));
+	pr_debug("nf_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		 NIPQUAD(rtp_exp->tuple.src.u3.ip),
+		 ntohs(rtp_exp->tuple.src.u.udp.port),
+		 NIPQUAD(rtp_exp->tuple.dst.u3.ip),
+		 ntohs(rtp_exp->tuple.dst.u.udp.port));
+	pr_debug("nf_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		 NIPQUAD(rtcp_exp->tuple.src.u3.ip),
+		 ntohs(rtcp_exp->tuple.src.u.udp.port),
+		 NIPQUAD(rtcp_exp->tuple.dst.u3.ip),
+		 ntohs(rtcp_exp->tuple.dst.u.udp.port));
 
 	return 0;
 }
@@ -317,9 +308,11 @@ static int nat_t120(struct sk_buff **pskb, struct nf_conn *ct,
 		return -1;
 	}
 
-	DEBUGP("nf_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
-	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+	pr_debug("nf_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		 NIPQUAD(exp->tuple.src.u3.ip),
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 NIPQUAD(exp->tuple.dst.u3.ip),
+		 ntohs(exp->tuple.dst.u.tcp.port));
 
 	return 0;
 }
@@ -369,9 +362,11 @@ static int nat_h245(struct sk_buff **pskb, struct nf_conn *ct,
 		return -1;
 	}
 
-	DEBUGP("nf_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
-	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+	pr_debug("nf_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		 NIPQUAD(exp->tuple.src.u3.ip),
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 NIPQUAD(exp->tuple.dst.u3.ip),
+		 ntohs(exp->tuple.dst.u.tcp.port));
 
 	return 0;
 }
@@ -465,9 +460,11 @@ static int nat_q931(struct sk_buff **pskb, struct nf_conn *ct,
 	}
 
 	/* Success */
-	DEBUGP("nf_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
-	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+	pr_debug("nf_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		 NIPQUAD(exp->tuple.src.u3.ip),
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 NIPQUAD(exp->tuple.dst.u3.ip),
+		 ntohs(exp->tuple.dst.u.tcp.port));
 
 	return 0;
 }
@@ -536,10 +533,12 @@ static int nat_callforwarding(struct sk_buff **pskb, struct nf_conn *ct,
 	}
 
 	/* Success */
-	DEBUGP("nf_nat_q931: expect Call Forwarding "
-	       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
-	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
-	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+	pr_debug("nf_nat_q931: expect Call Forwarding "
+		 "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		 NIPQUAD(exp->tuple.src.u3.ip),
+		 ntohs(exp->tuple.src.u.tcp.port),
+		 NIPQUAD(exp->tuple.dst.u3.ip),
+		 ntohs(exp->tuple.dst.u.tcp.port));
 
 	return 0;
 }
@@ -566,8 +565,6 @@ static int __init init(void)
 	rcu_assign_pointer(nat_h245_hook, nat_h245);
 	rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding);
 	rcu_assign_pointer(nat_q931_hook, nat_q931);
-
-	DEBUGP("nf_nat_h323: init success\n");
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index f3383fc14e1c..93d8a0a8f035 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -26,13 +26,9 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 
-#if 0
-#define DEBUGP printk
-#define DUMP_OFFSET(x)	printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
-#else
-#define DEBUGP(format, args...)
-#define DUMP_OFFSET(x)
-#endif
+#define DUMP_OFFSET(x) \
+	pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
+		 x->offset_before, x->offset_after, x->correction_pos);
 
 static DEFINE_SPINLOCK(nf_nat_seqofs_lock);
 
@@ -47,15 +43,15 @@ adjust_tcp_sequence(u32 seq,
 	struct nf_nat_seq *this_way, *other_way;
 	struct nf_conn_nat *nat = nfct_nat(ct);
 
-	DEBUGP("nf_nat_resize_packet: old_size = %u, new_size = %u\n",
-		(*skb)->len, new_size);
+	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
+		 ntohl(seq), seq);
 
 	dir = CTINFO2DIR(ctinfo);
 
 	this_way = &nat->seq[dir];
 	other_way = &nat->seq[!dir];
 
-	DEBUGP("nf_nat_resize_packet: Seq_offset before: ");
+	pr_debug("nf_nat_resize_packet: Seq_offset before: ");
 	DUMP_OFFSET(this_way);
 
 	spin_lock_bh(&nf_nat_seqofs_lock);
@@ -72,7 +68,7 @@ adjust_tcp_sequence(u32 seq,
 	}
 	spin_unlock_bh(&nf_nat_seqofs_lock);
 
-	DEBUGP("nf_nat_resize_packet: Seq_offset after: ");
+	pr_debug("nf_nat_resize_packet: Seq_offset after: ");
 	DUMP_OFFSET(this_way);
 }
 
@@ -100,14 +96,12 @@ static void mangle_contents(struct sk_buff *skb,
 
 	/* update skb info */
 	if (rep_len > match_len) {
-		DEBUGP("nf_nat_mangle_packet: Extending packet by "
-		       "%u from %u bytes\n", rep_len - match_len,
-		       skb->len);
+		pr_debug("nf_nat_mangle_packet: Extending packet by "
+			 "%u from %u bytes\n", rep_len - match_len, skb->len);
 		skb_put(skb, rep_len - match_len);
 	} else {
-		DEBUGP("nf_nat_mangle_packet: Shrinking packet from "
-		       "%u from %u bytes\n", match_len - rep_len,
-		       skb->len);
+		pr_debug("nf_nat_mangle_packet: Shrinking packet from "
+			 "%u from %u bytes\n", match_len - rep_len, skb->len);
 		__skb_trim(skb, skb->len + rep_len - match_len);
 	}
 
@@ -320,9 +314,9 @@ sack_adjust(struct sk_buff *skb,
 			new_end_seq = htonl(ntohl(sack->end_seq)
 				      - natseq->offset_before);
 
-		DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
-			ntohl(sack->start_seq), new_start_seq,
-			ntohl(sack->end_seq), new_end_seq);
+		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
+			 ntohl(sack->start_seq), new_start_seq,
+			 ntohl(sack->end_seq), new_end_seq);
 
 		nf_proto_csum_replace4(&tcph->check, skb,
 				       sack->start_seq, new_start_seq, 0);
@@ -414,9 +408,9 @@ nf_nat_seq_adjust(struct sk_buff **pskb,
 	nf_proto_csum_replace4(&tcph->check, *pskb, tcph->seq, newseq, 0);
 	nf_proto_csum_replace4(&tcph->check, *pskb, tcph->ack_seq, newack, 0);
 
-	DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
-		ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
-		ntohl(newack));
+	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+		 ntohl(newack));
 
 	tcph->seq = newseq;
 	tcph->ack_seq = newack;
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index db7fbf66fec0..bcf274bba602 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -22,12 +22,6 @@
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <linux/netfilter/nf_conntrack_irc.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("IRC (DCC) NAT helper");
 MODULE_LICENSE("GPL");
@@ -44,9 +38,6 @@ static unsigned int help(struct sk_buff **pskb,
 	u_int16_t port;
 	unsigned int ret;
 
-	DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n",
-	       expect->seq, exp_irc_info->len, ntohl(tcph->seq));
-
 	/* Reply comes from server. */
 	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
 	exp->dir = IP_CT_DIR_REPLY;
@@ -64,8 +55,8 @@ static unsigned int help(struct sk_buff **pskb,
 
 	ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip);
 	sprintf(buffer, "%u %u", ip, port);
-	DEBUGP("nf_nat_irc: inserting '%s' == %u.%u.%u.%u, port %u\n",
-	       buffer, NIPQUAD(ip), port);
+	pr_debug("nf_nat_irc: inserting '%s' == %u.%u.%u.%u, port %u\n",
+		 buffer, NIPQUAD(ip), port);
 
 	ret = nf_nat_mangle_tcp_packet(pskb, exp->master, ctinfo,
 				       matchoff, matchlen, buffer,
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index deb80ae2831e..984ec8308b2e 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -37,14 +37,6 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
 MODULE_ALIAS("ip_nat_pptp");
 
-#if 0
-extern const char *pptp_msg_name[];
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
-				       __FUNCTION__, ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static void pptp_nat_expected(struct nf_conn *ct,
 			      struct nf_conntrack_expect *exp)
 {
@@ -60,7 +52,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 
 	/* And here goes the grand finale of corrosion... */
 	if (exp->dir == IP_CT_DIR_ORIGINAL) {
-		DEBUGP("we are PNS->PAC\n");
+		pr_debug("we are PNS->PAC\n");
 		/* therefore, build tuple for PAC->PNS */
 		t.src.l3num = AF_INET;
 		t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
@@ -69,7 +61,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 		t.dst.u.gre.key = ct_pptp_info->pns_call_id;
 		t.dst.protonum = IPPROTO_GRE;
 	} else {
-		DEBUGP("we are PAC->PNS\n");
+		pr_debug("we are PAC->PNS\n");
 		/* build tuple for PNS->PAC */
 		t.src.l3num = AF_INET;
 		t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
@@ -79,15 +71,15 @@ static void pptp_nat_expected(struct nf_conn *ct,
 		t.dst.protonum = IPPROTO_GRE;
 	}
 
-	DEBUGP("trying to unexpect other dir: ");
+	pr_debug("trying to unexpect other dir: ");
 	NF_CT_DUMP_TUPLE(&t);
 	other_exp = nf_ct_expect_find_get(&t);
 	if (other_exp) {
 		nf_ct_unexpect_related(other_exp);
 		nf_ct_expect_put(other_exp);
-		DEBUGP("success\n");
+		pr_debug("success\n");
 	} else {
-		DEBUGP("not found!\n");
+		pr_debug("not found!\n");
 	}
 
 	/* This must be a fresh one. */
@@ -161,9 +153,9 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
 		break;
 	default:
-		DEBUGP("unknown outbound packet 0x%04x:%s\n", msg,
-		      (msg <= PPTP_MSG_MAX)?
-		      pptp_msg_name[msg]:pptp_msg_name[0]);
+		pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
+			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+					       pptp_msg_name[0]);
 		/* fall through */
 	case PPTP_SET_LINK_INFO:
 		/* only need to NAT in case PAC is behind NAT box */
@@ -179,8 +171,8 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 
 	/* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
 	 * down to here */
-	DEBUGP("altering call id from 0x%04x to 0x%04x\n",
-		ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
+	pr_debug("altering call id from 0x%04x to 0x%04x\n",
+		 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
 
 	/* mangle packet */
 	if (nf_nat_mangle_tcp_packet(pskb, ct, ctinfo,
@@ -255,8 +247,9 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
 		break;
 	default:
-		DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)?
-			pptp_msg_name[msg]:pptp_msg_name[0]);
+		pr_debug("unknown inbound packet %s\n",
+			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+					       pptp_msg_name[0]);
 		/* fall through */
 	case PPTP_START_SESSION_REQUEST:
 	case PPTP_START_SESSION_REPLY:
@@ -272,8 +265,8 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 	 * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */
 
 	/* mangle packet */
-	DEBUGP("altering peer call id from 0x%04x to 0x%04x\n",
-		ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
+	pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
+		 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
 
 	if (nf_nat_mangle_tcp_packet(pskb, ct, ctinfo,
 				     pcid_off + sizeof(struct pptp_pkt_hdr) +
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index c3908bc5a709..2e40cc83526a 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -36,13 +36,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
 
-#if 0
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s: " format, __FILE__, \
-				       __FUNCTION__, ## args)
-#else
-#define DEBUGP(x, args...)
-#endif
-
 /* is key in given range between min and max */
 static int
 gre_in_range(const struct nf_conntrack_tuple *tuple,
@@ -83,7 +76,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 		keyptr = &tuple->dst.u.gre.key;
 
 	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
-		DEBUGP("%p: NATing GRE PPTP\n", conntrack);
+		pr_debug("%p: NATing GRE PPTP\n", conntrack);
 		min = 1;
 		range_size = 0xffff;
 	} else {
@@ -91,7 +84,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 		range_size = ntohs(range->max.gre.key) - min + 1;
 	}
 
-	DEBUGP("min = %u, range_size = %u\n", min, range_size);
+	pr_debug("min = %u, range_size = %u\n", min, range_size);
 
 	for (i = 0; i < range_size; i++, key++) {
 		*keyptr = htons(min + key % range_size);
@@ -99,7 +92,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 			return 1;
 	}
 
-	DEBUGP("%p: no NAT mapping\n", conntrack);
+	pr_debug("%p: no NAT mapping\n", conntrack);
 	return 0;
 }
 
@@ -132,11 +125,11 @@ gre_manip_pkt(struct sk_buff **pskb, unsigned int iphdroff,
 		 * Try to behave like "nf_nat_proto_unknown" */
 		break;
 	case GRE_VERSION_PPTP:
-		DEBUGP("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
+		pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
 		pgreh->call_id = tuple->dst.u.gre.key;
 		break;
 	default:
-		DEBUGP("can't nat unknown GRE version\n");
+		pr_debug("can't nat unknown GRE version\n");
 		return 0;
 	}
 	return 1;
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 080393a143d7..0f45427e5fdc 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -24,12 +24,6 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_rule.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 #define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
 
 static struct
@@ -186,8 +180,8 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 	struct nf_nat_range range
 		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
 
-	DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n",
-	       ct, NIPQUAD(ip));
+	pr_debug("Allocating NULL binding for %p (%u.%u.%u.%u)\n",
+		 ct, NIPQUAD(ip));
 	return nf_nat_setup_info(ct, &range, hooknum);
 }
 
@@ -205,8 +199,8 @@ alloc_null_binding_confirmed(struct nf_conn *ct, unsigned int hooknum)
 	struct nf_nat_range range
 		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } };
 
-	DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
-	       ct, NIPQUAD(ip));
+	pr_debug("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
+		 ct, NIPQUAD(ip));
 	return nf_nat_setup_info(ct, &range, hooknum);
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 940cdfc429de..a889ec3ec83a 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -26,12 +26,6 @@ MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
 MODULE_DESCRIPTION("SIP NAT helper");
 MODULE_ALIAS("ip_nat_sip");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 struct addr_map {
 	struct {
 		char		src[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
@@ -257,8 +251,6 @@ static unsigned int ip_nat_sdp(struct sk_buff **pskb,
 	__be32 newip;
 	u_int16_t port;
 
-	DEBUGP("ip_nat_sdp():\n");
-
 	/* Connection will come from reply */
 	if (ct->tuplehash[dir].tuple.src.u3.ip ==
 	    ct->tuplehash[!dir].tuple.dst.u3.ip)
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 30eeaa4c645c..332814dac503 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -27,12 +27,6 @@
 #include <net/netfilter/nf_nat_helper.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 #ifdef CONFIG_XFRM
 static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
 {
@@ -117,7 +111,7 @@ nf_nat_fn(unsigned int hooknum,
 	if (!nat) {
 		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
 		if (nat == NULL) {
-			DEBUGP("failed to add NAT extension\n");
+			pr_debug("failed to add NAT extension\n");
 			return NF_ACCEPT;
 		}
 	}
@@ -154,9 +148,9 @@ nf_nat_fn(unsigned int hooknum,
 				return ret;
 			}
 		} else
-			DEBUGP("Already setup manip %s for ct %p\n",
-			       maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
-			       ct);
+			pr_debug("Already setup manip %s for ct %p\n",
+				 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
+				 ct);
 		break;
 
 	default:
@@ -270,7 +264,7 @@ nf_nat_adjust(unsigned int hooknum,
 
 	ct = nf_ct_get(*pskb, &ctinfo);
 	if (ct && test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) {
-		DEBUGP("nf_nat_standalone: adjusting sequence number\n");
+		pr_debug("nf_nat_standalone: adjusting sequence number\n");
 		if (!nf_nat_seq_adjust(pskb, ct, ctinfo))
 			return NF_DROP;
 	}
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 540bf14b851c..b05327ebd332 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -32,12 +32,6 @@ struct in_device;
 #include <net/route.h>
 #include <linux/netfilter_ipv6/ip6t_LOG.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Use lock to serialize, so printks don't overlap */
 static DEFINE_SPINLOCK(log_lock);
 
@@ -466,12 +460,12 @@ static bool ip6t_log_checkentry(const char *tablename,
 	const struct ip6t_log_info *loginfo = targinfo;
 
 	if (loginfo->level >= 8) {
-		DEBUGP("LOG: level %u >= 8\n", loginfo->level);
+		pr_debug("LOG: level %u >= 8\n", loginfo->level);
 		return false;
 	}
 	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
-		DEBUGP("LOG: prefix term %i\n",
-		       loginfo->prefix[sizeof(loginfo->prefix)-1]);
+		pr_debug("LOG: prefix term %i\n",
+			 loginfo->prefix[sizeof(loginfo->prefix)-1]);
 		return false;
 	}
 	return true;
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 14008dc6a197..2f487cda3b6b 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -34,12 +34,6 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
 MODULE_DESCRIPTION("IP6 tables REJECT target module");
 MODULE_LICENSE("GPL");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Send RST reply */
 static void send_reset(struct sk_buff *oldskb)
 {
@@ -54,7 +48,7 @@ static void send_reset(struct sk_buff *oldskb)
 
 	if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
 	    (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
-		DEBUGP("ip6t_REJECT: addr is not unicast.\n");
+		pr_debug("ip6t_REJECT: addr is not unicast.\n");
 		return;
 	}
 
@@ -62,7 +56,7 @@ static void send_reset(struct sk_buff *oldskb)
 	tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
 
 	if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
-		DEBUGP("ip6t_REJECT: Can't get TCP header.\n");
+		pr_debug("ip6t_REJECT: Can't get TCP header.\n");
 		return;
 	}
 
@@ -70,8 +64,9 @@ static void send_reset(struct sk_buff *oldskb)
 
 	/* IP header checks: fragment, too short. */
 	if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) {
-		DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n",
-			proto, otcplen);
+		pr_debug("ip6t_REJECT: proto(%d) != IPPROTO_TCP, "
+			 "or too short. otcplen = %d\n",
+			 proto, otcplen);
 		return;
 	}
 
@@ -80,14 +75,14 @@ static void send_reset(struct sk_buff *oldskb)
 
 	/* No RST for RST. */
 	if (otcph.rst) {
-		DEBUGP("ip6t_REJECT: RST is set\n");
+		pr_debug("ip6t_REJECT: RST is set\n");
 		return;
 	}
 
 	/* Check checksum. */
 	if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
 			    skb_checksum(oldskb, tcphoff, otcplen, 0))) {
-		DEBUGP("ip6t_REJECT: TCP checksum is invalid\n");
+		pr_debug("ip6t_REJECT: TCP checksum is invalid\n");
 		return;
 	}
 
@@ -186,7 +181,7 @@ static unsigned int reject6_target(struct sk_buff **pskb,
 {
 	const struct ip6t_reject_info *reject = targinfo;
 
-	DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__);
+	pr_debug("%s: medium point\n", __FUNCTION__);
 	/* WARNING: This code causes reentry within ip6tables.
 	   This means that the ip6tables jump stack is now crap.  We
 	   must return an absolute verdict. --RR */
@@ -237,7 +232,7 @@ static bool check(const char *tablename,
 		/* Must specify that it's a TCP packet */
 		if (e->ipv6.proto != IPPROTO_TCP
 		    || (e->ipv6.invflags & XT_INV_PROTO)) {
-			DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
+			printk("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
 			return false;
 		}
 	}
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index a9fe2aa97072..2a25fe25e0e0 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -23,21 +23,16 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IPv6 AH match");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Returns 1 if the spi is matched by the range, 0 otherwise */
 static inline bool
 spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
 {
 	bool r;
-	DEBUGP("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
-	       min,spi,max);
+
+	pr_debug("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",
+		 invert ? '!' : ' ', min, spi, max);
 	r = (spi >= min && spi <= max) ^ invert;
-	DEBUGP(" result %s\n",r? "PASS\n" : "FAILED\n");
+	pr_debug(" result %s\n", r ? "PASS" : "FAILED");
 	return r;
 }
 
@@ -73,22 +68,22 @@ match(const struct sk_buff *skb,
 
 	hdrlen = (ah->hdrlen + 2) << 2;
 
-	DEBUGP("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
-	DEBUGP("RES %04X ", ah->reserved);
-	DEBUGP("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi));
-
-	DEBUGP("IPv6 AH spi %02X ",
-	       spi_match(ahinfo->spis[0], ahinfo->spis[1],
-			 ntohl(ah->spi),
-			 !!(ahinfo->invflags & IP6T_AH_INV_SPI)));
-	DEBUGP("len %02X %04X %02X ",
-	       ahinfo->hdrlen, hdrlen,
-	       (!ahinfo->hdrlen ||
-		(ahinfo->hdrlen == hdrlen) ^
-		!!(ahinfo->invflags & IP6T_AH_INV_LEN)));
-	DEBUGP("res %02X %04X %02X\n",
-	       ahinfo->hdrres, ah->reserved,
-	       !(ahinfo->hdrres && ah->reserved));
+	pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
+	pr_debug("RES %04X ", ah->reserved);
+	pr_debug("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi));
+
+	pr_debug("IPv6 AH spi %02X ",
+		 spi_match(ahinfo->spis[0], ahinfo->spis[1],
+			   ntohl(ah->spi),
+			   !!(ahinfo->invflags & IP6T_AH_INV_SPI)));
+	pr_debug("len %02X %04X %02X ",
+		 ahinfo->hdrlen, hdrlen,
+		 (!ahinfo->hdrlen ||
+		  (ahinfo->hdrlen == hdrlen) ^
+		  !!(ahinfo->invflags & IP6T_AH_INV_LEN)));
+	pr_debug("res %02X %04X %02X\n",
+		 ahinfo->hdrres, ah->reserved,
+		 !(ahinfo->hdrres && ah->reserved));
 
 	return (ah != NULL)
 	       &&
@@ -114,7 +109,7 @@ checkentry(const char *tablename,
 	const struct ip6t_ah *ahinfo = matchinfo;
 
 	if (ahinfo->invflags & ~IP6T_AH_INV_MASK) {
-		DEBUGP("ip6t_ah: unknown flags %X\n", ahinfo->invflags);
+		pr_debug("ip6t_ah: unknown flags %X\n", ahinfo->invflags);
 		return false;
 	}
 	return true;
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index bb1cfa82b47c..968aeba02073 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -22,21 +22,15 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IPv6 FRAG match");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Returns 1 if the id is matched by the range, 0 otherwise */
 static inline bool
 id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
 {
 	bool r;
-	DEBUGP("frag id_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ',
-	       min, id, max);
+	pr_debug("frag id_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ',
+		 min, id, max);
 	r = (id >= min && id <= max) ^ invert;
-	DEBUGP(" result %s\n", r ? "PASS" : "FAILED");
+	pr_debug(" result %s\n", r ? "PASS" : "FAILED");
 	return r;
 }
 
@@ -69,37 +63,37 @@ match(const struct sk_buff *skb,
 		return false;
 	}
 
-	DEBUGP("INFO %04X ", fh->frag_off);
-	DEBUGP("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7);
-	DEBUGP("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6);
-	DEBUGP("MF %04X ", fh->frag_off & htons(IP6_MF));
-	DEBUGP("ID %u %08X\n", ntohl(fh->identification),
-	       ntohl(fh->identification));
-
-	DEBUGP("IPv6 FRAG id %02X ",
-	       id_match(fraginfo->ids[0], fraginfo->ids[1],
-			 ntohl(fh->identification),
-			 !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)));
-	DEBUGP("res %02X %02X%04X %02X ",
-	       fraginfo->flags & IP6T_FRAG_RES, fh->reserved,
-	       ntohs(fh->frag_off) & 0x6,
-	       !((fraginfo->flags & IP6T_FRAG_RES)
-		 && (fh->reserved || (ntohs(fh->frag_off) & 0x06))));
-	DEBUGP("first %02X %02X %02X ",
-	       fraginfo->flags & IP6T_FRAG_FST,
-	       ntohs(fh->frag_off) & ~0x7,
-	       !((fraginfo->flags & IP6T_FRAG_FST)
-		 && (ntohs(fh->frag_off) & ~0x7)));
-	DEBUGP("mf %02X %02X %02X ",
-	       fraginfo->flags & IP6T_FRAG_MF,
-	       ntohs(fh->frag_off) & IP6_MF,
-	       !((fraginfo->flags & IP6T_FRAG_MF)
-		 && !((ntohs(fh->frag_off) & IP6_MF))));
-	DEBUGP("last %02X %02X %02X\n",
-	       fraginfo->flags & IP6T_FRAG_NMF,
-	       ntohs(fh->frag_off) & IP6_MF,
-	       !((fraginfo->flags & IP6T_FRAG_NMF)
-		 && (ntohs(fh->frag_off) & IP6_MF)));
+	pr_debug("INFO %04X ", fh->frag_off);
+	pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7);
+	pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6);
+	pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF));
+	pr_debug("ID %u %08X\n", ntohl(fh->identification),
+		 ntohl(fh->identification));
+
+	pr_debug("IPv6 FRAG id %02X ",
+		 id_match(fraginfo->ids[0], fraginfo->ids[1],
+			  ntohl(fh->identification),
+			  !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)));
+	pr_debug("res %02X %02X%04X %02X ",
+		 fraginfo->flags & IP6T_FRAG_RES, fh->reserved,
+		 ntohs(fh->frag_off) & 0x6,
+		 !((fraginfo->flags & IP6T_FRAG_RES)
+		   && (fh->reserved || (ntohs(fh->frag_off) & 0x06))));
+	pr_debug("first %02X %02X %02X ",
+		 fraginfo->flags & IP6T_FRAG_FST,
+		 ntohs(fh->frag_off) & ~0x7,
+		 !((fraginfo->flags & IP6T_FRAG_FST)
+		   && (ntohs(fh->frag_off) & ~0x7)));
+	pr_debug("mf %02X %02X %02X ",
+		 fraginfo->flags & IP6T_FRAG_MF,
+		 ntohs(fh->frag_off) & IP6_MF,
+		 !((fraginfo->flags & IP6T_FRAG_MF)
+		   && !((ntohs(fh->frag_off) & IP6_MF))));
+	pr_debug("last %02X %02X %02X\n",
+		 fraginfo->flags & IP6T_FRAG_NMF,
+		 ntohs(fh->frag_off) & IP6_MF,
+		 !((fraginfo->flags & IP6T_FRAG_NMF)
+		   && (ntohs(fh->frag_off) & IP6_MF)));
 
 	return (fh != NULL)
 	       &&
@@ -131,7 +125,7 @@ checkentry(const char *tablename,
 	const struct ip6t_frag *fraginfo = matchinfo;
 
 	if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) {
-		DEBUGP("ip6t_frag: unknown flags %X\n", fraginfo->invflags);
+		pr_debug("ip6t_frag: unknown flags %X\n", fraginfo->invflags);
 		return false;
 	}
 	return true;
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 6247d4cdad99..e6ca6018b1ea 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -25,12 +25,6 @@ MODULE_DESCRIPTION("IPv6 opts match");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 MODULE_ALIAS("ip6t_dst");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /*
  *  (Type & 0xC0) >> 6
  *	0	-> ignorable
@@ -90,13 +84,13 @@ match(const struct sk_buff *skb,
 		return false;
 	}
 
-	DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
+	pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
 
-	DEBUGP("len %02X %04X %02X ",
-	       optinfo->hdrlen, hdrlen,
-	       (!(optinfo->flags & IP6T_OPTS_LEN) ||
-		((optinfo->hdrlen == hdrlen) ^
-		 !!(optinfo->invflags & IP6T_OPTS_INV_LEN))));
+	pr_debug("len %02X %04X %02X ",
+		 optinfo->hdrlen, hdrlen,
+		 (!(optinfo->flags & IP6T_OPTS_LEN) ||
+		  ((optinfo->hdrlen == hdrlen) ^
+		   !!(optinfo->invflags & IP6T_OPTS_INV_LEN))));
 
 	ret = (oh != NULL) &&
 	      (!(optinfo->flags & IP6T_OPTS_LEN) ||
@@ -108,10 +102,10 @@ match(const struct sk_buff *skb,
 	if (!(optinfo->flags & IP6T_OPTS_OPTS)) {
 		return ret;
 	} else if (optinfo->flags & IP6T_OPTS_NSTRICT) {
-		DEBUGP("Not strict - not implemented");
+		pr_debug("Not strict - not implemented");
 	} else {
-		DEBUGP("Strict ");
-		DEBUGP("#%d ", optinfo->optsnr);
+		pr_debug("Strict ");
+		pr_debug("#%d ", optinfo->optsnr);
 		for (temp = 0; temp < optinfo->optsnr; temp++) {
 			/* type field exists ? */
 			if (hdrlen < 1)
@@ -123,12 +117,11 @@ match(const struct sk_buff *skb,
 
 			/* Type check */
 			if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) {
-				DEBUGP("Tbad %02X %02X\n",
-				       *tp,
-				       (optinfo->opts[temp] & 0xFF00) >> 8);
+				pr_debug("Tbad %02X %02X\n", *tp,
+					 (optinfo->opts[temp] & 0xFF00) >> 8);
 				return false;
 			} else {
-				DEBUGP("Tok ");
+				pr_debug("Tok ");
 			}
 			/* Length check */
 			if (*tp) {
@@ -145,23 +138,23 @@ match(const struct sk_buff *skb,
 				spec_len = optinfo->opts[temp] & 0x00FF;
 
 				if (spec_len != 0x00FF && spec_len != *lp) {
-					DEBUGP("Lbad %02X %04X\n", *lp,
-					       spec_len);
+					pr_debug("Lbad %02X %04X\n", *lp,
+						 spec_len);
 					return false;
 				}
-				DEBUGP("Lok ");
+				pr_debug("Lok ");
 				optlen = *lp + 2;
 			} else {
-				DEBUGP("Pad1\n");
+				pr_debug("Pad1\n");
 				optlen = 1;
 			}
 
 			/* Step to the next */
-			DEBUGP("len%04X \n", optlen);
+			pr_debug("len%04X \n", optlen);
 
 			if ((ptr > skb->len - optlen || hdrlen < optlen) &&
 			    temp < optinfo->optsnr - 1) {
-				DEBUGP("new pointer is too large! \n");
+				pr_debug("new pointer is too large! \n");
 				break;
 			}
 			ptr += optlen;
@@ -187,7 +180,7 @@ checkentry(const char *tablename,
 	const struct ip6t_opts *optsinfo = matchinfo;
 
 	if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) {
-		DEBUGP("ip6t_opts: unknown flags %X\n", optsinfo->invflags);
+		pr_debug("ip6t_opts: unknown flags %X\n", optsinfo->invflags);
 		return false;
 	}
 	return true;
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 549deea26418..357cea703bd9 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -24,21 +24,15 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IPv6 RT match");
 MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Returns 1 if the id is matched by the range, 0 otherwise */
 static inline bool
 segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
 {
 	bool r;
-	DEBUGP("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x",
-	       invert ? '!' : ' ', min, id, max);
+	pr_debug("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x",
+		 invert ? '!' : ' ', min, id, max);
 	r = (id >= min && id <= max) ^ invert;
-	DEBUGP(" result %s\n", r ? "PASS" : "FAILED");
+	pr_debug(" result %s\n", r ? "PASS" : "FAILED");
 	return r;
 }
 
@@ -82,29 +76,29 @@ match(const struct sk_buff *skb,
 		return false;
 	}
 
-	DEBUGP("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
-	DEBUGP("TYPE %04X ", rh->type);
-	DEBUGP("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
-
-	DEBUGP("IPv6 RT segsleft %02X ",
-	       segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
-			      rh->segments_left,
-			      !!(rtinfo->invflags & IP6T_RT_INV_SGS)));
-	DEBUGP("type %02X %02X %02X ",
-	       rtinfo->rt_type, rh->type,
-	       (!(rtinfo->flags & IP6T_RT_TYP) ||
-		((rtinfo->rt_type == rh->type) ^
-		 !!(rtinfo->invflags & IP6T_RT_INV_TYP))));
-	DEBUGP("len %02X %04X %02X ",
-	       rtinfo->hdrlen, hdrlen,
-	       !(rtinfo->flags & IP6T_RT_LEN) ||
-		((rtinfo->hdrlen == hdrlen) ^
-		 !!(rtinfo->invflags & IP6T_RT_INV_LEN)));
-	DEBUGP("res %02X %02X %02X ",
-	       rtinfo->flags & IP6T_RT_RES,
-	       ((const struct rt0_hdr *)rh)->reserved,
-	       !((rtinfo->flags & IP6T_RT_RES) &&
-		 (((const struct rt0_hdr *)rh)->reserved)));
+	pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
+	pr_debug("TYPE %04X ", rh->type);
+	pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
+
+	pr_debug("IPv6 RT segsleft %02X ",
+		 segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+				rh->segments_left,
+				!!(rtinfo->invflags & IP6T_RT_INV_SGS)));
+	pr_debug("type %02X %02X %02X ",
+		 rtinfo->rt_type, rh->type,
+		 (!(rtinfo->flags & IP6T_RT_TYP) ||
+		  ((rtinfo->rt_type == rh->type) ^
+		   !!(rtinfo->invflags & IP6T_RT_INV_TYP))));
+	pr_debug("len %02X %04X %02X ",
+		 rtinfo->hdrlen, hdrlen,
+		 !(rtinfo->flags & IP6T_RT_LEN) ||
+		  ((rtinfo->hdrlen == hdrlen) ^
+		   !!(rtinfo->invflags & IP6T_RT_INV_LEN)));
+	pr_debug("res %02X %02X %02X ",
+		 rtinfo->flags & IP6T_RT_RES,
+		 ((const struct rt0_hdr *)rh)->reserved,
+		 !((rtinfo->flags & IP6T_RT_RES) &&
+		   (((const struct rt0_hdr *)rh)->reserved)));
 
 	ret = (rh != NULL)
 	      &&
@@ -131,18 +125,18 @@ match(const struct sk_buff *skb,
 		ret = (*rp == 0);
 	}
 
-	DEBUGP("#%d ", rtinfo->addrnr);
+	pr_debug("#%d ", rtinfo->addrnr);
 	if (!(rtinfo->flags & IP6T_RT_FST)) {
 		return ret;
 	} else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) {
-		DEBUGP("Not strict ");
+		pr_debug("Not strict ");
 		if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
-			DEBUGP("There isn't enough space\n");
+			pr_debug("There isn't enough space\n");
 			return false;
 		} else {
 			unsigned int i = 0;
 
-			DEBUGP("#%d ", rtinfo->addrnr);
+			pr_debug("#%d ", rtinfo->addrnr);
 			for (temp = 0;
 			     temp < (unsigned int)((hdrlen - 8) / 16);
 			     temp++) {
@@ -156,25 +150,25 @@ match(const struct sk_buff *skb,
 				BUG_ON(ap == NULL);
 
 				if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
-					DEBUGP("i=%d temp=%d;\n", i, temp);
+					pr_debug("i=%d temp=%d;\n", i, temp);
 					i++;
 				}
 				if (i == rtinfo->addrnr)
 					break;
 			}
-			DEBUGP("i=%d #%d\n", i, rtinfo->addrnr);
+			pr_debug("i=%d #%d\n", i, rtinfo->addrnr);
 			if (i == rtinfo->addrnr)
 				return ret;
 			else
 				return false;
 		}
 	} else {
-		DEBUGP("Strict ");
+		pr_debug("Strict ");
 		if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
-			DEBUGP("There isn't enough space\n");
+			pr_debug("There isn't enough space\n");
 			return false;
 		} else {
-			DEBUGP("#%d ", rtinfo->addrnr);
+			pr_debug("#%d ", rtinfo->addrnr);
 			for (temp = 0; temp < rtinfo->addrnr; temp++) {
 				ap = skb_header_pointer(skb,
 							ptr
@@ -187,7 +181,7 @@ match(const struct sk_buff *skb,
 				if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
 					break;
 			}
-			DEBUGP("temp=%d #%d\n", temp, rtinfo->addrnr);
+			pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr);
 			if (temp == rtinfo->addrnr &&
 			    temp == (unsigned int)((hdrlen - 8) / 16))
 				return ret;
@@ -210,14 +204,14 @@ checkentry(const char *tablename,
 	const struct ip6t_rt *rtinfo = matchinfo;
 
 	if (rtinfo->invflags & ~IP6T_RT_INV_MASK) {
-		DEBUGP("ip6t_rt: unknown flags %X\n", rtinfo->invflags);
+		pr_debug("ip6t_rt: unknown flags %X\n", rtinfo->invflags);
 		return false;
 	}
 	if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) &&
 	    (!(rtinfo->flags & IP6T_RT_TYP) ||
 	     (rtinfo->rt_type != 0) ||
 	     (rtinfo->invflags & IP6T_RT_INV_TYP))) {
-		DEBUGP("`--rt-type 0' required before `--rt-0-*'");
+		pr_debug("`--rt-type 0' required before `--rt-0-*'");
 		return false;
 	}
 
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index f2d26495f413..f0a9efa67fb5 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -21,12 +21,6 @@ MODULE_DESCRIPTION("ip6tables mangle table");
 			    (1 << NF_IP6_LOCAL_OUT) | \
 			    (1 << NF_IP6_POST_ROUTING))
 
-#if 0
-#define DEBUGP(x, args...)	printk(KERN_DEBUG x, ## args)
-#else
-#define DEBUGP(x, args...)
-#endif
-
 static struct
 {
 	struct ip6t_replace repl;
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 0acda45d455d..ec290e4ebdd8 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -8,12 +8,6 @@
 
 #define RAW_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_OUT))
 
-#if 0
-#define DEBUGP(x, args...)	printk(KERN_DEBUG x, ## args)
-#else
-#define DEBUGP(x, args...)
-#endif
-
 static struct
 {
 	struct ip6t_replace repl;
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 747b01e53132..89e20ab494b8 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -26,12 +26,6 @@
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
 			     struct nf_conntrack_tuple *tuple)
 {
@@ -136,7 +130,7 @@ ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
 	 * except of IPv6 & ext headers. but it's tracked anyway. - YK
 	 */
 	if ((protoff < 0) || (protoff > (*pskb)->len)) {
-		DEBUGP("ip6_conntrack_core: can't find proto in pkt\n");
+		pr_debug("ip6_conntrack_core: can't find proto in pkt\n");
 		NF_CT_STAT_INC_ATOMIC(error);
 		NF_CT_STAT_INC_ATOMIC(invalid);
 		return -NF_ACCEPT;
@@ -178,7 +172,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
 	protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum,
 					 (*pskb)->len - extoff);
 	if (protoff > (*pskb)->len || pnum == NEXTHDR_FRAGMENT) {
-		DEBUGP("proto header not found\n");
+		pr_debug("proto header not found\n");
 		return NF_ACCEPT;
 	}
 
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index a514661d25dd..9defc7e14554 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -27,12 +27,6 @@
 
 static unsigned long nf_ct_icmpv6_timeout __read_mostly = 30*HZ;
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static int icmpv6_pkt_to_tuple(const struct sk_buff *skb,
 			       unsigned int dataoff,
 			       struct nf_conntrack_tuple *tuple)
@@ -125,8 +119,8 @@ static int icmpv6_new(struct nf_conn *conntrack,
 
 	if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
 		/* Can't create a new ICMPv6 `conn' with this. */
-		DEBUGP("icmpv6: can't create new conn with type %u\n",
-		       type + 128);
+		pr_debug("icmpv6: can't create new conn with type %u\n",
+			 type + 128);
 		NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
 		return 0;
 	}
@@ -152,14 +146,15 @@ icmpv6_error_message(struct sk_buff *skb,
 
 	hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr);
 	if (hp == NULL) {
-		DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n");
+		pr_debug("icmpv6_error: Can't get ICMPv6 hdr.\n");
 		return -NF_ACCEPT;
 	}
 
 	inip6off = icmp6off + sizeof(_hdr);
 	if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr),
 			  &inprotonum, sizeof(inprotonum)) != 0) {
-		DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n");
+		pr_debug("icmpv6_error: Can't get nexthdr in inner IPv6 "
+			 "header.\n");
 		return -NF_ACCEPT;
 	}
 	inprotoff = nf_ct_ipv6_skip_exthdr(skb,
@@ -169,7 +164,8 @@ icmpv6_error_message(struct sk_buff *skb,
 						    - sizeof(struct ipv6hdr));
 
 	if ((inprotoff > skb->len) || (inprotonum == NEXTHDR_FRAGMENT)) {
-		DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n");
+		pr_debug("icmpv6_error: Can't get protocol header in ICMPv6 "
+			 "payload.\n");
 		return -NF_ACCEPT;
 	}
 
@@ -179,7 +175,7 @@ icmpv6_error_message(struct sk_buff *skb,
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum,
 			     &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) {
-		DEBUGP("icmpv6_error: Can't get tuple\n");
+		pr_debug("icmpv6_error: Can't get tuple\n");
 		return -NF_ACCEPT;
 	}
 
@@ -187,7 +183,7 @@ icmpv6_error_message(struct sk_buff *skb,
 	   been preserved inside the ICMP. */
 	if (!nf_ct_invert_tuple(&intuple, &origtuple,
 				&nf_conntrack_l3proto_ipv6, inproto)) {
-		DEBUGP("icmpv6_error: Can't invert tuple\n");
+		pr_debug("icmpv6_error: Can't invert tuple\n");
 		return -NF_ACCEPT;
 	}
 
@@ -195,7 +191,7 @@ icmpv6_error_message(struct sk_buff *skb,
 
 	h = nf_conntrack_find_get(&intuple);
 	if (!h) {
-		DEBUGP("icmpv6_error: no match\n");
+		pr_debug("icmpv6_error: no match\n");
 		return -NF_ACCEPT;
 	} else {
 		if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 347ab7608231..25442a8c1ba8 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -44,12 +44,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 #define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */
 #define NF_CT_FRAG6_LOW_THRESH 196608  /* == 192*1024 */
 #define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT
@@ -343,7 +337,7 @@ nf_ct_frag6_create(unsigned int hash, __be32 id, struct in6_addr *src,				   str
 	struct nf_ct_frag6_queue *fq;
 
 	if ((fq = frag_alloc_queue()) == NULL) {
-		DEBUGP("Can't alloc new queue\n");
+		pr_debug("Can't alloc new queue\n");
 		goto oom;
 	}
 
@@ -393,7 +387,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 	int offset, end;
 
 	if (fq->last_in & COMPLETE) {
-		DEBUGP("Allready completed\n");
+		pr_debug("Allready completed\n");
 		goto err;
 	}
 
@@ -402,7 +396,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
 
 	if ((unsigned int)end > IPV6_MAXPLEN) {
-		DEBUGP("offset is too large.\n");
+		pr_debug("offset is too large.\n");
 		return -1;
 	}
 
@@ -420,7 +414,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 		 */
 		if (end < fq->len ||
 		    ((fq->last_in & LAST_IN) && end != fq->len)) {
-			DEBUGP("already received last fragment\n");
+			pr_debug("already received last fragment\n");
 			goto err;
 		}
 		fq->last_in |= LAST_IN;
@@ -433,13 +427,13 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 			/* RFC2460 says always send parameter problem in
 			 * this case. -DaveM
 			 */
-			DEBUGP("the end of this fragment is not rounded to 8 bytes.\n");
+			pr_debug("end of fragment not rounded to 8 bytes.\n");
 			return -1;
 		}
 		if (end > fq->len) {
 			/* Some bits beyond end -> corruption. */
 			if (fq->last_in & LAST_IN) {
-				DEBUGP("last packet already reached.\n");
+				pr_debug("last packet already reached.\n");
 				goto err;
 			}
 			fq->len = end;
@@ -451,11 +445,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 
 	/* Point into the IP datagram 'data' part. */
 	if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
-		DEBUGP("queue: message is too short.\n");
+		pr_debug("queue: message is too short.\n");
 		goto err;
 	}
 	if (pskb_trim_rcsum(skb, end - offset)) {
-		DEBUGP("Can't trim\n");
+		pr_debug("Can't trim\n");
 		goto err;
 	}
 
@@ -480,11 +474,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 		if (i > 0) {
 			offset += i;
 			if (end <= offset) {
-				DEBUGP("overlap\n");
+				pr_debug("overlap\n");
 				goto err;
 			}
 			if (!pskb_pull(skb, i)) {
-				DEBUGP("Can't pull\n");
+				pr_debug("Can't pull\n");
 				goto err;
 			}
 			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -503,7 +497,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 			/* Eat head of the next overlapped fragment
 			 * and leave the loop. The next ones cannot overlap.
 			 */
-			DEBUGP("Eat head of the overlapped parts.: %d", i);
+			pr_debug("Eat head of the overlapped parts.: %d", i);
 			if (!pskb_pull(next, i))
 				goto err;
 
@@ -586,13 +580,13 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 		       sizeof(struct ipv6hdr) + fq->len -
 		       sizeof(struct frag_hdr));
 	if (payload_len > IPV6_MAXPLEN) {
-		DEBUGP("payload len is too large.\n");
+		pr_debug("payload len is too large.\n");
 		goto out_oversize;
 	}
 
 	/* Head of list must not be cloned. */
 	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) {
-		DEBUGP("skb is cloned but can't expand head");
+		pr_debug("skb is cloned but can't expand head");
 		goto out_oom;
 	}
 
@@ -604,7 +598,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 		int i, plen = 0;
 
 		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) {
-			DEBUGP("Can't alloc skb\n");
+			pr_debug("Can't alloc skb\n");
 			goto out_oom;
 		}
 		clone->next = head->next;
@@ -719,11 +713,11 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
 			return -1;
 		}
 		if (len < (int)sizeof(struct ipv6_opt_hdr)) {
-			DEBUGP("too short\n");
+			pr_debug("too short\n");
 			return -1;
 		}
 		if (nexthdr == NEXTHDR_NONE) {
-			DEBUGP("next header is none\n");
+			pr_debug("next header is none\n");
 			return -1;
 		}
 		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
@@ -764,7 +758,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 
 	/* Jumbo payload inhibits frag. header */
 	if (ipv6_hdr(skb)->payload_len == 0) {
-		DEBUGP("payload len = 0\n");
+		pr_debug("payload len = 0\n");
 		return skb;
 	}
 
@@ -773,14 +767,14 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 
 	clone = skb_clone(skb, GFP_ATOMIC);
 	if (clone == NULL) {
-		DEBUGP("Can't clone skb\n");
+		pr_debug("Can't clone skb\n");
 		return skb;
 	}
 
 	NFCT_FRAG6_CB(clone)->orig = skb;
 
 	if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
-		DEBUGP("message is too short.\n");
+		pr_debug("message is too short.\n");
 		goto ret_orig;
 	}
 
@@ -789,7 +783,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 	fhdr = (struct frag_hdr *)skb_transport_header(clone);
 
 	if (!(fhdr->frag_off & htons(0xFFF9))) {
-		DEBUGP("Invalid fragment offset\n");
+		pr_debug("Invalid fragment offset\n");
 		/* It is not a fragmented frame */
 		goto ret_orig;
 	}
@@ -799,7 +793,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 
 	fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr);
 	if (fq == NULL) {
-		DEBUGP("Can't find and can't create new queue\n");
+		pr_debug("Can't find and can't create new queue\n");
 		goto ret_orig;
 	}
 
@@ -807,7 +801,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 
 	if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
 		spin_unlock(&fq->lock);
-		DEBUGP("Can't insert skb to queue\n");
+		pr_debug("Can't insert skb to queue\n");
 		fq_put(fq, NULL);
 		goto ret_orig;
 	}
@@ -815,7 +809,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
 	if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) {
 		ret_skb = nf_ct_frag6_reasm(fq, dev);
 		if (ret_skb == NULL)
-			DEBUGP("Can't reassemble fragmented packets\n");
+			pr_debug("Can't reassemble fragmented packets\n");
 	}
 	spin_unlock(&fq->lock);
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 472396dac05c..3d1411012a2c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -40,12 +40,6 @@
 
 #define NF_CONNTRACK_VERSION	"0.5.0"
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 DEFINE_RWLOCK(nf_conntrack_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 
@@ -141,7 +135,7 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 static void
 clean_from_lists(struct nf_conn *ct)
 {
-	DEBUGP("clean_from_lists(%p)\n", ct);
+	pr_debug("clean_from_lists(%p)\n", ct);
 	hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode);
 	hlist_del(&ct->tuplehash[IP_CT_DIR_REPLY].hnode);
 
@@ -155,7 +149,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	struct nf_conn *ct = (struct nf_conn *)nfct;
 	struct nf_conntrack_l4proto *l4proto;
 
-	DEBUGP("destroy_conntrack(%p)\n", ct);
+	pr_debug("destroy_conntrack(%p)\n", ct);
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
@@ -194,7 +188,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	if (ct->master)
 		nf_ct_put(ct->master);
 
-	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
 	nf_conntrack_free(ct);
 }
 
@@ -313,7 +307,7 @@ __nf_conntrack_confirm(struct sk_buff **pskb)
 	/* No external references means noone else could have
 	   confirmed us. */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
-	DEBUGP("Confirming conntrack %p\n", ct);
+	pr_debug("Confirming conntrack %p\n", ct);
 
 	write_lock_bh(&nf_conntrack_lock);
 
@@ -446,7 +440,7 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
 
 	conntrack = kmem_cache_zalloc(nf_conntrack_cachep, GFP_ATOMIC);
 	if (conntrack == NULL) {
-		DEBUGP("nf_conntrack_alloc: Can't alloc conntrack.\n");
+		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
 		atomic_dec(&nf_conntrack_count);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -485,27 +479,27 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
 	struct nf_conntrack_expect *exp;
 
 	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
-		DEBUGP("Can't invert tuple.\n");
+		pr_debug("Can't invert tuple.\n");
 		return NULL;
 	}
 
 	conntrack = nf_conntrack_alloc(tuple, &repl_tuple);
 	if (conntrack == NULL || IS_ERR(conntrack)) {
-		DEBUGP("Can't allocate conntrack.\n");
+		pr_debug("Can't allocate conntrack.\n");
 		return (struct nf_conntrack_tuple_hash *)conntrack;
 	}
 
 	if (!l4proto->new(conntrack, skb, dataoff)) {
 		nf_conntrack_free(conntrack);
-		DEBUGP("init conntrack: can't track with proto module\n");
+		pr_debug("init conntrack: can't track with proto module\n");
 		return NULL;
 	}
 
 	write_lock_bh(&nf_conntrack_lock);
 	exp = nf_ct_find_expectation(tuple);
 	if (exp) {
-		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
-			conntrack, exp);
+		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+			 conntrack, exp);
 		/* Welcome, Mr. Bond.  We've been expecting you... */
 		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
 		conntrack->master = exp->master;
@@ -568,7 +562,7 @@ resolve_normal_ct(struct sk_buff *skb,
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 			     dataoff, l3num, protonum, &tuple, l3proto,
 			     l4proto)) {
-		DEBUGP("resolve_normal_ct: Can't get tuple\n");
+		pr_debug("resolve_normal_ct: Can't get tuple\n");
 		return NULL;
 	}
 
@@ -591,13 +585,14 @@ resolve_normal_ct(struct sk_buff *skb,
 	} else {
 		/* Once we've had two way comms, always ESTABLISHED. */
 		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
-			DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
+			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
 			*ctinfo = IP_CT_ESTABLISHED;
 		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
-			DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
+			pr_debug("nf_conntrack_in: related packet for %p\n",
+				 ct);
 			*ctinfo = IP_CT_RELATED;
 		} else {
-			DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
+			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
 			*ctinfo = IP_CT_NEW;
 		}
 		*set_reply = 0;
@@ -629,7 +624,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
 	l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
 
 	if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
-		DEBUGP("not prepared to track yet or error occured\n");
+		pr_debug("not prepared to track yet or error occured\n");
 		return -ret;
 	}
 
@@ -665,7 +660,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
 	if (ret < 0) {
 		/* Invalid: inverse of the return code tells
 		 * the netfilter core what to do */
-		DEBUGP("nf_conntrack_in: Can't track with proto module\n");
+		pr_debug("nf_conntrack_in: Can't track with proto module\n");
 		nf_conntrack_put((*pskb)->nfct);
 		(*pskb)->nfct = NULL;
 		NF_CT_STAT_INC_ATOMIC(invalid);
@@ -706,7 +701,7 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
 	/* Should be unconfirmed, so not in hash table yet */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 
-	DEBUGP("Altering reply tuple of %p to ", ct);
+	pr_debug("Altering reply tuple of %p to ", ct);
 	NF_CT_DUMP_TUPLE(newreply);
 
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index cd9c2d00cc09..c763ee74ea02 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -51,12 +51,6 @@ unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb,
 				struct nf_conntrack_expect *exp);
 EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char);
 static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char);
 static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
@@ -138,13 +132,13 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],
 			if (*data == term && i == array_size - 1)
 				return len;
 
-			DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
-			       len, i, *data);
+			pr_debug("Char %u (got %u nums) `%u' unexpected\n",
+				 len, i, *data);
 			return 0;
 		}
 	}
-	DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
-
+	pr_debug("Failed to fill %u numbers separated by %c\n",
+		 array_size, sep);
 	return 0;
 }
 
@@ -178,13 +172,13 @@ static int get_port(const char *data, int start, size_t dlen, char delim,
 			if (tmp_port == 0)
 				break;
 			*port = htons(tmp_port);
-			DEBUGP("get_port: return %d\n", tmp_port);
+			pr_debug("get_port: return %d\n", tmp_port);
 			return i + 1;
 		}
 		else if (data[i] >= '0' && data[i] <= '9')
 			tmp_port = tmp_port*10 + data[i] - '0';
 		else { /* Some other crap */
-			DEBUGP("get_port: invalid char.\n");
+			pr_debug("get_port: invalid char.\n");
 			break;
 		}
 	}
@@ -201,22 +195,22 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
 	/* First character is delimiter, then "1" for IPv4 or "2" for IPv6,
 	   then delimiter again. */
 	if (dlen <= 3) {
-		DEBUGP("EPRT: too short\n");
+		pr_debug("EPRT: too short\n");
 		return 0;
 	}
 	delim = data[0];
 	if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
-		DEBUGP("try_eprt: invalid delimitter.\n");
+		pr_debug("try_eprt: invalid delimitter.\n");
 		return 0;
 	}
 
 	if ((cmd->l3num == PF_INET && data[1] != '1') ||
 	    (cmd->l3num == PF_INET6 && data[1] != '2')) {
-		DEBUGP("EPRT: invalid protocol number.\n");
+		pr_debug("EPRT: invalid protocol number.\n");
 		return 0;
 	}
 
-	DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim);
+	pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim);
 
 	if (data[1] == '1') {
 		u_int32_t array[4];
@@ -234,7 +228,7 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
 
 	if (length == 0)
 		return 0;
-	DEBUGP("EPRT: Got IP address!\n");
+	pr_debug("EPRT: Got IP address!\n");
 	/* Start offset includes initial "|1|", and trailing delimiter */
 	return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port);
 }
@@ -267,7 +261,7 @@ static int find_pattern(const char *data, size_t dlen,
 {
 	size_t i;
 
-	DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
+	pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
 	if (dlen == 0)
 		return 0;
 
@@ -282,17 +276,17 @@ static int find_pattern(const char *data, size_t dlen,
 #if 0
 		size_t i;
 
-		DEBUGP("ftp: string mismatch\n");
+		pr_debug("ftp: string mismatch\n");
 		for (i = 0; i < plen; i++) {
-			DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
-				i, data[i], data[i],
-				pattern[i], pattern[i]);
+			pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
+				 i, data[i], data[i],
+				 pattern[i], pattern[i]);
 		}
 #endif
 		return 0;
 	}
 
-	DEBUGP("Pattern matches!\n");
+	pr_debug("Pattern matches!\n");
 	/* Now we've found the constant string, try to skip
 	   to the 'skip' character */
 	for (i = plen; data[i] != skip; i++)
@@ -301,14 +295,14 @@ static int find_pattern(const char *data, size_t dlen,
 	/* Skip over the last character */
 	i++;
 
-	DEBUGP("Skipped up to `%c'!\n", skip);
+	pr_debug("Skipped up to `%c'!\n", skip);
 
 	*numoff = i;
 	*numlen = getnum(data + i, dlen - i, cmd, term);
 	if (!*numlen)
 		return -1;
 
-	DEBUGP("Match succeeded!\n");
+	pr_debug("Match succeeded!\n");
 	return 1;
 }
 
@@ -373,7 +367,7 @@ static int help(struct sk_buff **pskb,
 	/* Until there's been traffic both ways, don't look in packets. */
 	if (ctinfo != IP_CT_ESTABLISHED
 	    && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
-		DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
+		pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
 		return NF_ACCEPT;
 	}
 
@@ -384,8 +378,8 @@ static int help(struct sk_buff **pskb,
 	dataoff = protoff + th->doff * 4;
 	/* No data? */
 	if (dataoff >= (*pskb)->len) {
-		DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
-			(*pskb)->len);
+		pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
+			 (*pskb)->len);
 		return NF_ACCEPT;
 	}
 	datalen = (*pskb)->len - dataoff;
@@ -400,11 +394,11 @@ static int help(struct sk_buff **pskb,
 	/* Look up to see if we're just after a \n. */
 	if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
 		/* Now if this ends in \n, update ftp info. */
-		DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
-		       ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
-		       ct_ftp_info->seq_aft_nl[dir][0],
-		       ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
-		       ct_ftp_info->seq_aft_nl[dir][1]);
+		pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",
+			 ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
+			 ct_ftp_info->seq_aft_nl[dir][0],
+			 ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
+			 ct_ftp_info->seq_aft_nl[dir][1]);
 		ret = NF_ACCEPT;
 		goto out_update_nl;
 	}
@@ -442,9 +436,9 @@ static int help(struct sk_buff **pskb,
 		goto out_update_nl;
 	}
 
-	DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
-	       (int)matchlen, fb_ptr + matchoff,
-	       matchlen, ntohl(th->seq) + matchoff);
+	pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
+		 matchlen, fb_ptr + matchoff,
+		 matchlen, ntohl(th->seq) + matchoff);
 
 	exp = nf_ct_expect_alloc(ct);
 	if (exp == NULL) {
@@ -466,14 +460,16 @@ static int help(struct sk_buff **pskb,
 		   different IP address.  Simply don't record it for
 		   NAT. */
 		if (cmd.l3num == PF_INET) {
-			DEBUGP("conntrack_ftp: NOT RECORDING: " NIPQUAD_FMT " != " NIPQUAD_FMT "\n",
-			       NIPQUAD(cmd.u3.ip),
-			       NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip));
+			pr_debug("conntrack_ftp: NOT RECORDING: " NIPQUAD_FMT
+				 " != " NIPQUAD_FMT "\n",
+				 NIPQUAD(cmd.u3.ip),
+				 NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip));
 		} else {
-			DEBUGP("conntrack_ftp: NOT RECORDING: " NIP6_FMT " != " NIP6_FMT "\n",
-			       NIP6(*((struct in6_addr *)cmd.u3.ip6)),
-			       NIP6(*((struct in6_addr *)ct->tuplehash[dir]
-							.tuple.src.u3.ip6)));
+			pr_debug("conntrack_ftp: NOT RECORDING: " NIP6_FMT
+				 " != " NIP6_FMT "\n",
+				 NIP6(*((struct in6_addr *)cmd.u3.ip6)),
+				 NIP6(*((struct in6_addr *)
+					ct->tuplehash[dir].tuple.src.u3.ip6)));
 		}
 
 		/* Thanks to Cristiano Lincoln Mattos
@@ -530,9 +526,9 @@ static void nf_conntrack_ftp_fini(void)
 			if (ftp[i][j].me == NULL)
 				continue;
 
-			DEBUGP("nf_ct_ftp: unregistering helper for pf: %d "
-			       "port: %d\n",
-				ftp[i][j].tuple.src.l3num, ports[i]);
+			pr_debug("nf_ct_ftp: unregistering helper for pf: %d "
+				 "port: %d\n",
+				 ftp[i][j].tuple.src.l3num, ports[i]);
 			nf_conntrack_helper_unregister(&ftp[i][j]);
 		}
 	}
@@ -571,9 +567,9 @@ static int __init nf_conntrack_ftp_init(void)
 				sprintf(tmpname, "ftp-%d", ports[i]);
 			ftp[i][j].name = tmpname;
 
-			DEBUGP("nf_ct_ftp: registering helper for pf: %d "
-			       "port: %d\n",
-				ftp[i][j].tuple.src.l3num, ports[i]);
+			pr_debug("nf_ct_ftp: registering helper for pf: %d "
+				 "port: %d\n",
+				 ftp[i][j].tuple.src.l3num, ports[i]);
 			ret = nf_conntrack_helper_register(&ftp[i][j]);
 			if (ret) {
 				printk("nf_ct_ftp: failed to register helper "
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index aa5ba99b7a08..a8a9dfbe7a67 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -31,12 +31,6 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <linux/netfilter/nf_conntrack_h323.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Parameters */
 static unsigned int default_rrq_ttl __read_mostly = 300;
 module_param(default_rrq_ttl, uint, 0600);
@@ -150,9 +144,9 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff,
 		if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
 			/* Netmeeting sends TPKT header and data separately */
 			if (info->tpkt_len[dir] > 0) {
-				DEBUGP("nf_ct_h323: previous packet "
-				       "indicated separate TPKT data of %hu "
-				       "bytes\n", info->tpkt_len[dir]);
+				pr_debug("nf_ct_h323: previous packet "
+					 "indicated separate TPKT data of %hu "
+					 "bytes\n", info->tpkt_len[dir]);
 				if (info->tpkt_len[dir] <= tcpdatalen) {
 					/* Yes, there was a TPKT header
 					 * received */
@@ -163,7 +157,7 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff,
 				}
 
 				/* Fragmented TPKT */
-				DEBUGP("nf_ct_h323: fragmented TPKT\n");
+				pr_debug("nf_ct_h323: fragmented TPKT\n");
 				goto clear_out;
 			}
 
@@ -190,9 +184,9 @@ static int get_tpkt_data(struct sk_buff **pskb, unsigned int protoff,
 	if (tpktlen > tcpdatalen) {
 		if (tcpdatalen == 4) {	/* Separate TPKT header */
 			/* Netmeeting sends TPKT header and data separately */
-			DEBUGP("nf_ct_h323: separate TPKT header indicates "
-			       "there will be TPKT data of %hu bytes\n",
-			       tpktlen - 4);
+			pr_debug("nf_ct_h323: separate TPKT header indicates "
+				 "there will be TPKT data of %hu bytes\n",
+				 tpktlen - 4);
 			info->tpkt_len[dir] = tpktlen - 4;
 			return 0;
 		}
@@ -308,9 +302,9 @@ static int expect_rtp_rtcp(struct sk_buff **pskb, struct nf_conn *ct,
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(rtp_exp) == 0) {
 			if (nf_ct_expect_related(rtcp_exp) == 0) {
-				DEBUGP("nf_ct_h323: expect RTP ");
+				pr_debug("nf_ct_h323: expect RTP ");
 				NF_CT_DUMP_TUPLE(&rtp_exp->tuple);
-				DEBUGP("nf_ct_h323: expect RTCP ");
+				pr_debug("nf_ct_h323: expect RTCP ");
 				NF_CT_DUMP_TUPLE(&rtcp_exp->tuple);
 			} else {
 				nf_ct_unexpect_related(rtp_exp);
@@ -365,7 +359,7 @@ static int expect_t120(struct sk_buff **pskb,
 			       port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp) == 0) {
-			DEBUGP("nf_ct_h323: expect T.120 ");
+			pr_debug("nf_ct_h323: expect T.120 ");
 			NF_CT_DUMP_TUPLE(&exp->tuple);
 		} else
 			ret = -1;
@@ -413,7 +407,7 @@ static int process_olc(struct sk_buff **pskb, struct nf_conn *ct,
 {
 	int ret;
 
-	DEBUGP("nf_ct_h323: OpenLogicalChannel\n");
+	pr_debug("nf_ct_h323: OpenLogicalChannel\n");
 
 	if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
 	    eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
@@ -473,7 +467,7 @@ static int process_olca(struct sk_buff **pskb, struct nf_conn *ct,
 	H2250LogicalChannelAckParameters *ack;
 	int ret;
 
-	DEBUGP("nf_ct_h323: OpenLogicalChannelAck\n");
+	pr_debug("nf_ct_h323: OpenLogicalChannelAck\n");
 
 	if ((olca->options &
 	     eOpenLogicalChannelAck_reverseLogicalChannelParameters) &&
@@ -544,8 +538,8 @@ static int process_h245(struct sk_buff **pskb, struct nf_conn *ct,
 			return process_olc(pskb, ct, ctinfo, data, dataoff,
 					   &mscm->request.openLogicalChannel);
 		}
-		DEBUGP("nf_ct_h323: H.245 Request %d\n",
-		       mscm->request.choice);
+		pr_debug("nf_ct_h323: H.245 Request %d\n",
+			 mscm->request.choice);
 		break;
 	case eMultimediaSystemControlMessage_response:
 		if (mscm->response.choice ==
@@ -554,11 +548,11 @@ static int process_h245(struct sk_buff **pskb, struct nf_conn *ct,
 					    &mscm->response.
 					    openLogicalChannelAck);
 		}
-		DEBUGP("nf_ct_h323: H.245 Response %d\n",
-		       mscm->response.choice);
+		pr_debug("nf_ct_h323: H.245 Response %d\n",
+			 mscm->response.choice);
 		break;
 	default:
-		DEBUGP("nf_ct_h323: H.245 signal %d\n", mscm->choice);
+		pr_debug("nf_ct_h323: H.245 signal %d\n", mscm->choice);
 		break;
 	}
 
@@ -580,23 +574,23 @@ static int h245_help(struct sk_buff **pskb, unsigned int protoff,
 	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
 		return NF_ACCEPT;
 	}
-	DEBUGP("nf_ct_h245: skblen = %u\n", (*pskb)->len);
+	pr_debug("nf_ct_h245: skblen = %u\n", (*pskb)->len);
 
 	spin_lock_bh(&nf_h323_lock);
 
 	/* Process each TPKT */
 	while (get_tpkt_data(pskb, protoff, ct, ctinfo,
 			     &data, &datalen, &dataoff)) {
-		DEBUGP("nf_ct_h245: TPKT len=%d ", datalen);
+		pr_debug("nf_ct_h245: TPKT len=%d ", datalen);
 		NF_CT_DUMP_TUPLE(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
 
 		/* Decode H.245 signal */
 		ret = DecodeMultimediaSystemControlMessage(data, datalen,
 							   &mscm);
 		if (ret < 0) {
-			DEBUGP("nf_ct_h245: decoding error: %s\n",
-			       ret == H323_ERROR_BOUND ?
-			       "out of bound" : "out of range");
+			pr_debug("nf_ct_h245: decoding error: %s\n",
+				 ret == H323_ERROR_BOUND ?
+				 "out of bound" : "out of range");
 			/* We don't drop when decoding error */
 			break;
 		}
@@ -697,7 +691,7 @@ static int expect_h245(struct sk_buff **pskb, struct nf_conn *ct,
 			       port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp) == 0) {
-			DEBUGP("nf_ct_q931: expect H.245 ");
+			pr_debug("nf_ct_q931: expect H.245 ");
 			NF_CT_DUMP_TUPLE(&exp->tuple);
 		} else
 			ret = -1;
@@ -786,7 +780,7 @@ static int expect_callforwarding(struct sk_buff **pskb,
 	if (callforward_filter &&
 	    callforward_do_filter(&addr, &ct->tuplehash[!dir].tuple.src.u3,
 				  ct->tuplehash[!dir].tuple.src.l3num)) {
-		DEBUGP("nf_ct_q931: Call Forwarding not tracked\n");
+		pr_debug("nf_ct_q931: Call Forwarding not tracked\n");
 		return 0;
 	}
 
@@ -808,7 +802,7 @@ static int expect_callforwarding(struct sk_buff **pskb,
 					 taddr, port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp) == 0) {
-			DEBUGP("nf_ct_q931: expect Call Forwarding ");
+			pr_debug("nf_ct_q931: expect Call Forwarding ");
 			NF_CT_DUMP_TUPLE(&exp->tuple);
 		} else
 			ret = -1;
@@ -832,7 +826,7 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct,
 	union nf_conntrack_address addr;
 	typeof(set_h225_addr_hook) set_h225_addr;
 
-	DEBUGP("nf_ct_q931: Setup\n");
+	pr_debug("nf_ct_q931: Setup\n");
 
 	if (setup->options & eSetup_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
@@ -847,11 +841,11 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct,
 	    get_h225_addr(ct, *data, &setup->destCallSignalAddress,
 			  &addr, &port) &&
 	    memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) {
-		DEBUGP("nf_ct_q931: set destCallSignalAddress "
-		       NIP6_FMT ":%hu->" NIP6_FMT ":%hu\n",
-		       NIP6(*(struct in6_addr *)&addr), ntohs(port),
-		       NIP6(*(struct in6_addr *)&ct->tuplehash[!dir].tuple.src.u3),
-		       ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
+		pr_debug("nf_ct_q931: set destCallSignalAddress "
+			 NIP6_FMT ":%hu->" NIP6_FMT ":%hu\n",
+			 NIP6(*(struct in6_addr *)&addr), ntohs(port),
+			 NIP6(*(struct in6_addr *)&ct->tuplehash[!dir].tuple.src.u3),
+			 ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
 		ret = set_h225_addr(pskb, data, dataoff,
 				    &setup->destCallSignalAddress,
 				    &ct->tuplehash[!dir].tuple.src.u3,
@@ -865,11 +859,11 @@ static int process_setup(struct sk_buff **pskb, struct nf_conn *ct,
 	    get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
 			  &addr, &port) &&
 	    memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) {
-		DEBUGP("nf_ct_q931: set sourceCallSignalAddress "
-		       NIP6_FMT ":%hu->" NIP6_FMT ":%hu\n",
-		       NIP6(*(struct in6_addr *)&addr), ntohs(port),
-		       NIP6(*(struct in6_addr *)&ct->tuplehash[!dir].tuple.dst.u3),
-		       ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
+		pr_debug("nf_ct_q931: set sourceCallSignalAddress "
+			 NIP6_FMT ":%hu->" NIP6_FMT ":%hu\n",
+			 NIP6(*(struct in6_addr *)&addr), ntohs(port),
+			 NIP6(*(struct in6_addr *)&ct->tuplehash[!dir].tuple.dst.u3),
+			 ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
 		ret = set_h225_addr(pskb, data, dataoff,
 				    &setup->sourceCallSignalAddress,
 				    &ct->tuplehash[!dir].tuple.dst.u3,
@@ -900,7 +894,7 @@ static int process_callproceeding(struct sk_buff **pskb,
 	int ret;
 	int i;
 
-	DEBUGP("nf_ct_q931: CallProceeding\n");
+	pr_debug("nf_ct_q931: CallProceeding\n");
 
 	if (callproc->options & eCallProceeding_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
@@ -930,7 +924,7 @@ static int process_connect(struct sk_buff **pskb, struct nf_conn *ct,
 	int ret;
 	int i;
 
-	DEBUGP("nf_ct_q931: Connect\n");
+	pr_debug("nf_ct_q931: Connect\n");
 
 	if (connect->options & eConnect_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
@@ -960,7 +954,7 @@ static int process_alerting(struct sk_buff **pskb, struct nf_conn *ct,
 	int ret;
 	int i;
 
-	DEBUGP("nf_ct_q931: Alerting\n");
+	pr_debug("nf_ct_q931: Alerting\n");
 
 	if (alert->options & eAlerting_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
@@ -990,7 +984,7 @@ static int process_facility(struct sk_buff **pskb, struct nf_conn *ct,
 	int ret;
 	int i;
 
-	DEBUGP("nf_ct_q931: Facility\n");
+	pr_debug("nf_ct_q931: Facility\n");
 
 	if (facility->reason.choice == eFacilityReason_callForwarded) {
 		if (facility->options & eFacility_UUIE_alternativeAddress)
@@ -1029,7 +1023,7 @@ static int process_progress(struct sk_buff **pskb, struct nf_conn *ct,
 	int ret;
 	int i;
 
-	DEBUGP("nf_ct_q931: Progress\n");
+	pr_debug("nf_ct_q931: Progress\n");
 
 	if (progress->options & eProgress_UUIE_h245Address) {
 		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
@@ -1086,8 +1080,8 @@ static int process_q931(struct sk_buff **pskb, struct nf_conn *ct,
 				       &pdu->h323_message_body.progress);
 		break;
 	default:
-		DEBUGP("nf_ct_q931: Q.931 signal %d\n",
-		       pdu->h323_message_body.choice);
+		pr_debug("nf_ct_q931: Q.931 signal %d\n",
+			 pdu->h323_message_body.choice);
 		break;
 	}
 
@@ -1121,22 +1115,22 @@ static int q931_help(struct sk_buff **pskb, unsigned int protoff,
 	    ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
 		return NF_ACCEPT;
 	}
-	DEBUGP("nf_ct_q931: skblen = %u\n", (*pskb)->len);
+	pr_debug("nf_ct_q931: skblen = %u\n", (*pskb)->len);
 
 	spin_lock_bh(&nf_h323_lock);
 
 	/* Process each TPKT */
 	while (get_tpkt_data(pskb, protoff, ct, ctinfo,
 			     &data, &datalen, &dataoff)) {
-		DEBUGP("nf_ct_q931: TPKT len=%d ", datalen);
+		pr_debug("nf_ct_q931: TPKT len=%d ", datalen);
 		NF_CT_DUMP_TUPLE(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
 
 		/* Decode Q.931 signal */
 		ret = DecodeQ931(data, datalen, &q931);
 		if (ret < 0) {
-			DEBUGP("nf_ct_q931: decoding error: %s\n",
-			       ret == H323_ERROR_BOUND ?
-			       "out of bound" : "out of range");
+			pr_debug("nf_ct_q931: decoding error: %s\n",
+				 ret == H323_ERROR_BOUND ?
+				 "out of bound" : "out of range");
 			/* We don't drop when decoding error */
 			break;
 		}
@@ -1274,7 +1268,7 @@ static int expect_q931(struct sk_buff **pskb, struct nf_conn *ct,
 		ret = nat_q931(pskb, ct, ctinfo, data, taddr, i, port, exp);
 	} else {		/* Conntrack only */
 		if (nf_ct_expect_related(exp) == 0) {
-			DEBUGP("nf_ct_ras: expect Q.931 ");
+			pr_debug("nf_ct_ras: expect Q.931 ");
 			NF_CT_DUMP_TUPLE(&exp->tuple);
 
 			/* Save port for looking up expect in processing RCF */
@@ -1295,7 +1289,7 @@ static int process_grq(struct sk_buff **pskb, struct nf_conn *ct,
 {
 	typeof(set_ras_addr_hook) set_ras_addr;
 
-	DEBUGP("nf_ct_ras: GRQ\n");
+	pr_debug("nf_ct_ras: GRQ\n");
 
 	set_ras_addr = rcu_dereference(set_ras_addr_hook);
 	if (set_ras_addr && ct->status & IPS_NAT_MASK)	/* NATed */
@@ -1315,7 +1309,7 @@ static int process_gcf(struct sk_buff **pskb, struct nf_conn *ct,
 	union nf_conntrack_address addr;
 	struct nf_conntrack_expect *exp;
 
-	DEBUGP("nf_ct_ras: GCF\n");
+	pr_debug("nf_ct_ras: GCF\n");
 
 	if (!get_h225_addr(ct, *data, &gcf->rasAddress, &addr, &port))
 		return 0;
@@ -1338,7 +1332,7 @@ static int process_gcf(struct sk_buff **pskb, struct nf_conn *ct,
 	exp->helper = nf_conntrack_helper_ras;
 
 	if (nf_ct_expect_related(exp) == 0) {
-		DEBUGP("nf_ct_ras: expect RAS ");
+		pr_debug("nf_ct_ras: expect RAS ");
 		NF_CT_DUMP_TUPLE(&exp->tuple);
 	} else
 		ret = -1;
@@ -1357,7 +1351,7 @@ static int process_rrq(struct sk_buff **pskb, struct nf_conn *ct,
 	int ret;
 	typeof(set_ras_addr_hook) set_ras_addr;
 
-	DEBUGP("nf_ct_ras: RRQ\n");
+	pr_debug("nf_ct_ras: RRQ\n");
 
 	ret = expect_q931(pskb, ct, ctinfo, data,
 			  rrq->callSignalAddress.item,
@@ -1375,7 +1369,7 @@ static int process_rrq(struct sk_buff **pskb, struct nf_conn *ct,
 	}
 
 	if (rrq->options & eRegistrationRequest_timeToLive) {
-		DEBUGP("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
+		pr_debug("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
 		info->timeout = rrq->timeToLive;
 	} else
 		info->timeout = default_rrq_ttl;
@@ -1394,7 +1388,7 @@ static int process_rcf(struct sk_buff **pskb, struct nf_conn *ct,
 	struct nf_conntrack_expect *exp;
 	typeof(set_sig_addr_hook) set_sig_addr;
 
-	DEBUGP("nf_ct_ras: RCF\n");
+	pr_debug("nf_ct_ras: RCF\n");
 
 	set_sig_addr = rcu_dereference(set_sig_addr_hook);
 	if (set_sig_addr && ct->status & IPS_NAT_MASK) {
@@ -1406,14 +1400,13 @@ static int process_rcf(struct sk_buff **pskb, struct nf_conn *ct,
 	}
 
 	if (rcf->options & eRegistrationConfirm_timeToLive) {
-		DEBUGP("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
+		pr_debug("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
 		info->timeout = rcf->timeToLive;
 	}
 
 	if (info->timeout > 0) {
-		DEBUGP
-		    ("nf_ct_ras: set RAS connection timeout to %u seconds\n",
-		     info->timeout);
+		pr_debug("nf_ct_ras: set RAS connection timeout to "
+			 "%u seconds\n", info->timeout);
 		nf_ct_refresh(ct, *pskb, info->timeout * HZ);
 
 		/* Set expect timeout */
@@ -1421,9 +1414,9 @@ static int process_rcf(struct sk_buff **pskb, struct nf_conn *ct,
 		exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,
 				  info->sig_port[!dir]);
 		if (exp) {
-			DEBUGP("nf_ct_ras: set Q.931 expect "
-			       "timeout to %u seconds for",
-			       info->timeout);
+			pr_debug("nf_ct_ras: set Q.931 expect "
+				 "timeout to %u seconds for",
+				 info->timeout);
 			NF_CT_DUMP_TUPLE(&exp->tuple);
 			set_expect_timeout(exp, info->timeout);
 		}
@@ -1443,7 +1436,7 @@ static int process_urq(struct sk_buff **pskb, struct nf_conn *ct,
 	int ret;
 	typeof(set_sig_addr_hook) set_sig_addr;
 
-	DEBUGP("nf_ct_ras: URQ\n");
+	pr_debug("nf_ct_ras: URQ\n");
 
 	set_sig_addr = rcu_dereference(set_sig_addr_hook);
 	if (set_sig_addr && ct->status & IPS_NAT_MASK) {
@@ -1476,7 +1469,7 @@ static int process_arq(struct sk_buff **pskb, struct nf_conn *ct,
 	union nf_conntrack_address addr;
 	typeof(set_h225_addr_hook) set_h225_addr;
 
-	DEBUGP("nf_ct_ras: ARQ\n");
+	pr_debug("nf_ct_ras: ARQ\n");
 
 	set_h225_addr = rcu_dereference(set_h225_addr_hook);
 	if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
@@ -1519,7 +1512,7 @@ static int process_acf(struct sk_buff **pskb, struct nf_conn *ct,
 	struct nf_conntrack_expect *exp;
 	typeof(set_sig_addr_hook) set_sig_addr;
 
-	DEBUGP("nf_ct_ras: ACF\n");
+	pr_debug("nf_ct_ras: ACF\n");
 
 	if (!get_h225_addr(ct, *data, &acf->destCallSignalAddress,
 			   &addr, &port))
@@ -1544,7 +1537,7 @@ static int process_acf(struct sk_buff **pskb, struct nf_conn *ct,
 	exp->helper = nf_conntrack_helper_q931;
 
 	if (nf_ct_expect_related(exp) == 0) {
-		DEBUGP("nf_ct_ras: expect Q.931 ");
+		pr_debug("nf_ct_ras: expect Q.931 ");
 		NF_CT_DUMP_TUPLE(&exp->tuple);
 	} else
 		ret = -1;
@@ -1561,7 +1554,7 @@ static int process_lrq(struct sk_buff **pskb, struct nf_conn *ct,
 {
 	typeof(set_ras_addr_hook) set_ras_addr;
 
-	DEBUGP("nf_ct_ras: LRQ\n");
+	pr_debug("nf_ct_ras: LRQ\n");
 
 	set_ras_addr = rcu_dereference(set_ras_addr_hook);
 	if (set_ras_addr && ct->status & IPS_NAT_MASK)
@@ -1581,7 +1574,7 @@ static int process_lcf(struct sk_buff **pskb, struct nf_conn *ct,
 	union nf_conntrack_address addr;
 	struct nf_conntrack_expect *exp;
 
-	DEBUGP("nf_ct_ras: LCF\n");
+	pr_debug("nf_ct_ras: LCF\n");
 
 	if (!get_h225_addr(ct, *data, &lcf->callSignalAddress,
 			   &addr, &port))
@@ -1597,7 +1590,7 @@ static int process_lcf(struct sk_buff **pskb, struct nf_conn *ct,
 	exp->helper = nf_conntrack_helper_q931;
 
 	if (nf_ct_expect_related(exp) == 0) {
-		DEBUGP("nf_ct_ras: expect Q.931 ");
+		pr_debug("nf_ct_ras: expect Q.931 ");
 		NF_CT_DUMP_TUPLE(&exp->tuple);
 	} else
 		ret = -1;
@@ -1618,7 +1611,7 @@ static int process_irr(struct sk_buff **pskb, struct nf_conn *ct,
 	typeof(set_ras_addr_hook) set_ras_addr;
 	typeof(set_sig_addr_hook) set_sig_addr;
 
-	DEBUGP("nf_ct_ras: IRR\n");
+	pr_debug("nf_ct_ras: IRR\n");
 
 	set_ras_addr = rcu_dereference(set_ras_addr_hook);
 	if (set_ras_addr && ct->status & IPS_NAT_MASK) {
@@ -1677,7 +1670,7 @@ static int process_ras(struct sk_buff **pskb, struct nf_conn *ct,
 		return process_irr(pskb, ct, ctinfo, data,
 				   &ras->infoRequestResponse);
 	default:
-		DEBUGP("nf_ct_ras: RAS message %d\n", ras->choice);
+		pr_debug("nf_ct_ras: RAS message %d\n", ras->choice);
 		break;
 	}
 
@@ -1693,7 +1686,7 @@ static int ras_help(struct sk_buff **pskb, unsigned int protoff,
 	int datalen = 0;
 	int ret;
 
-	DEBUGP("nf_ct_ras: skblen = %u\n", (*pskb)->len);
+	pr_debug("nf_ct_ras: skblen = %u\n", (*pskb)->len);
 
 	spin_lock_bh(&nf_h323_lock);
 
@@ -1701,15 +1694,15 @@ static int ras_help(struct sk_buff **pskb, unsigned int protoff,
 	data = get_udp_data(pskb, protoff, &datalen);
 	if (data == NULL)
 		goto accept;
-	DEBUGP("nf_ct_ras: RAS message len=%d ", datalen);
+	pr_debug("nf_ct_ras: RAS message len=%d ", datalen);
 	NF_CT_DUMP_TUPLE(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
 
 	/* Decode RAS message */
 	ret = DecodeRasMessage(data, datalen, &ras);
 	if (ret < 0) {
-		DEBUGP("nf_ct_ras: decoding error: %s\n",
-		       ret == H323_ERROR_BOUND ?
-		       "out of bound" : "out of range");
+		pr_debug("nf_ct_ras: decoding error: %s\n",
+			 ret == H323_ERROR_BOUND ?
+			 "out of bound" : "out of range");
 		goto accept;
 	}
 
@@ -1760,7 +1753,7 @@ static void __exit nf_conntrack_h323_fini(void)
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
 	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
 	kfree(h323_buffer);
-	DEBUGP("nf_ct_h323: fini\n");
+	pr_debug("nf_ct_h323: fini\n");
 }
 
 /****************************************************************************/
@@ -1783,7 +1776,7 @@ static int __init nf_conntrack_h323_init(void)
 	ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
 	if (ret < 0)
 		goto err4;
-	DEBUGP("nf_ct_h323: init success\n");
+	pr_debug("nf_ct_h323: init success\n");
 	return 0;
 
 err4:
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 8c7340794bf6..1562ca97a349 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -12,6 +12,7 @@
 #include <linux/moduleparam.h>
 #include <linux/skbuff.h>
 #include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/netfilter.h>
 
@@ -55,13 +56,6 @@ static const char *dccprotos[] = {
 
 #define MINMATCHLEN	5
 
-#if 0
-#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \
-				       __FILE__, __FUNCTION__ , ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* tries to get the ip_addr and port out of a dcc command
  * return value: -1 on failure, 0 on success
  *	data		pointer to first byte of DCC command data
@@ -99,6 +93,7 @@ static int help(struct sk_buff **pskb, unsigned int protoff,
 		struct nf_conn *ct, enum ip_conntrack_info ctinfo)
 {
 	unsigned int dataoff;
+	struct iphdr *iph;
 	struct tcphdr _tcph, *th;
 	char *data, *data_limit, *ib_ptr;
 	int dir = CTINFO2DIR(ctinfo);
@@ -148,9 +143,10 @@ static int help(struct sk_buff **pskb, unsigned int protoff,
 		data += 5;
 		/* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
 
-		DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n",
-			NIPQUAD(iph->saddr), ntohs(th->source),
-			NIPQUAD(iph->daddr), ntohs(th->dest));
+		iph = ip_hdr(*pskb);
+		pr_debug("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u\n",
+			 NIPQUAD(iph->saddr), ntohs(th->source),
+			 NIPQUAD(iph->daddr), ntohs(th->dest));
 
 		for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
 			if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
@@ -158,18 +154,18 @@ static int help(struct sk_buff **pskb, unsigned int protoff,
 				continue;
 			}
 			data += strlen(dccprotos[i]);
-			DEBUGP("DCC %s detected\n", dccprotos[i]);
+			pr_debug("DCC %s detected\n", dccprotos[i]);
 
 			/* we have at least
 			 * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
 			 * data left (== 14/13 bytes) */
 			if (parse_dcc((char *)data, data_limit, &dcc_ip,
 				       &dcc_port, &addr_beg_p, &addr_end_p)) {
-				DEBUGP("unable to parse dcc command\n");
+				pr_debug("unable to parse dcc command\n");
 				continue;
 			}
-			DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n",
-				HIPQUAD(dcc_ip), dcc_port);
+			pr_debug("DCC bound ip/port: %u.%u.%u.%u:%u\n",
+				 HIPQUAD(dcc_ip), dcc_port);
 
 			/* dcc_ip can be the internal OR external (NAT'ed) IP */
 			tuple = &ct->tuplehash[dir].tuple;
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
index 2fd0f11b8fb2..b1bfa207a850 100644
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -31,12 +31,6 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
 				struct nf_conntrack_tuple *tuple)
 {
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 63dac5eb959f..b0804199ab59 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -65,7 +65,7 @@ void
 			     struct nf_conntrack_expect *exp) __read_mostly;
 EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
 
-#if 0
+#ifdef DEBUG
 /* PptpControlMessageType names */
 const char *pptp_msg_name[] = {
 	"UNKNOWN_MESSAGE",
@@ -86,9 +86,6 @@ const char *pptp_msg_name[] = {
 	"SET_LINK_INFO"
 };
 EXPORT_SYMBOL(pptp_msg_name);
-#define DEBUGP(format, args...)	printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
-#else
-#define DEBUGP(format, args...)
 #endif
 
 #define SECS *HZ
@@ -102,7 +99,7 @@ static void pptp_expectfn(struct nf_conn *ct,
 			 struct nf_conntrack_expect *exp)
 {
 	typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
-	DEBUGP("increasing timeouts\n");
+	pr_debug("increasing timeouts\n");
 
 	/* increase timeout of GRE data channel conntrack entry */
 	ct->proto.gre.timeout	     = PPTP_GRE_TIMEOUT;
@@ -121,17 +118,17 @@ static void pptp_expectfn(struct nf_conn *ct,
 
 		/* obviously this tuple inversion only works until you do NAT */
 		nf_ct_invert_tuplepr(&inv_t, &exp->tuple);
-		DEBUGP("trying to unexpect other dir: ");
+		pr_debug("trying to unexpect other dir: ");
 		NF_CT_DUMP_TUPLE(&inv_t);
 
 		exp_other = nf_ct_expect_find_get(&inv_t);
 		if (exp_other) {
 			/* delete other expectation.  */
-			DEBUGP("found\n");
+			pr_debug("found\n");
 			nf_ct_unexpect_related(exp_other);
 			nf_ct_expect_put(exp_other);
 		} else {
-			DEBUGP("not found\n");
+			pr_debug("not found\n");
 		}
 	}
 	rcu_read_unlock();
@@ -143,13 +140,13 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
 	struct nf_conntrack_expect *exp;
 	struct nf_conn *sibling;
 
-	DEBUGP("trying to timeout ct or exp for tuple ");
+	pr_debug("trying to timeout ct or exp for tuple ");
 	NF_CT_DUMP_TUPLE(t);
 
 	h = nf_conntrack_find_get(t);
 	if (h)  {
 		sibling = nf_ct_tuplehash_to_ctrack(h);
-		DEBUGP("setting timeout of conntrack %p to 0\n", sibling);
+		pr_debug("setting timeout of conntrack %p to 0\n", sibling);
 		sibling->proto.gre.timeout	  = 0;
 		sibling->proto.gre.stream_timeout = 0;
 		if (del_timer(&sibling->timeout))
@@ -159,7 +156,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t)
 	} else {
 		exp = nf_ct_expect_find_get(t);
 		if (exp) {
-			DEBUGP("unexpect_related of expect %p\n", exp);
+			pr_debug("unexpect_related of expect %p\n", exp);
 			nf_ct_unexpect_related(exp);
 			nf_ct_expect_put(exp);
 			return 1;
@@ -182,7 +179,7 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
 	t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id;
 	t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id;
 	if (!destroy_sibling_or_exp(&t))
-		DEBUGP("failed to timeout original pns->pac ct/exp\n");
+		pr_debug("failed to timeout original pns->pac ct/exp\n");
 
 	/* try reply (pac->pns) tuple */
 	memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
@@ -190,7 +187,7 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
 	t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id;
 	t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id;
 	if (!destroy_sibling_or_exp(&t))
-		DEBUGP("failed to timeout reply pac->pns ct/exp\n");
+		pr_debug("failed to timeout reply pac->pns ct/exp\n");
 }
 
 /* expect GRE connections (PNS->PAC and PAC->PNS direction) */
@@ -270,7 +267,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 	typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
 
 	msg = ntohs(ctlh->messageType);
-	DEBUGP("inbound control message %s\n", pptp_msg_name[msg]);
+	pr_debug("inbound control message %s\n", pptp_msg_name[msg]);
 
 	switch (msg) {
 	case PPTP_START_SESSION_REPLY:
@@ -305,8 +302,8 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		pcid = pptpReq->ocack.peersCallID;
 		if (info->pns_call_id != pcid)
 			goto invalid;
-		DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
-			ntohs(cid), ntohs(pcid));
+		pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+			 ntohs(cid), ntohs(pcid));
 
 		if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
 			info->cstate = PPTP_CALL_OUT_CONF;
@@ -322,7 +319,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 			goto invalid;
 
 		cid = pptpReq->icreq.callID;
-		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
 		info->cstate = PPTP_CALL_IN_REQ;
 		info->pac_call_id = cid;
 		break;
@@ -341,7 +338,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		if (info->pns_call_id != pcid)
 			goto invalid;
 
-		DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
+		pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
 		info->cstate = PPTP_CALL_IN_CONF;
 
 		/* we expect a GRE connection from PAC to PNS */
@@ -351,7 +348,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 	case PPTP_CALL_DISCONNECT_NOTIFY:
 		/* server confirms disconnect */
 		cid = pptpReq->disc.callID;
-		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
 		info->cstate = PPTP_CALL_NONE;
 
 		/* untrack this call id, unexpect GRE packets */
@@ -374,11 +371,11 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 	return NF_ACCEPT;
 
 invalid:
-	DEBUGP("invalid %s: type=%d cid=%u pcid=%u "
-	       "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
-	       msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
-	       msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,
-	       ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+	pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+		 msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,
+		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
 	return NF_ACCEPT;
 }
 
@@ -396,7 +393,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 	typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
 
 	msg = ntohs(ctlh->messageType);
-	DEBUGP("outbound control message %s\n", pptp_msg_name[msg]);
+	pr_debug("outbound control message %s\n", pptp_msg_name[msg]);
 
 	switch (msg) {
 	case PPTP_START_SESSION_REQUEST:
@@ -418,7 +415,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		info->cstate = PPTP_CALL_OUT_REQ;
 		/* track PNS call id */
 		cid = pptpReq->ocreq.callID;
-		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
 		info->pns_call_id = cid;
 		break;
 
@@ -432,8 +429,8 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		pcid = pptpReq->icack.peersCallID;
 		if (info->pac_call_id != pcid)
 			goto invalid;
-		DEBUGP("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
-		       ntohs(cid), ntohs(pcid));
+		pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
+			 ntohs(cid), ntohs(pcid));
 
 		if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
 			/* part two of the three-way handshake */
@@ -469,11 +466,11 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 	return NF_ACCEPT;
 
 invalid:
-	DEBUGP("invalid %s: type=%d cid=%u pcid=%u "
-	       "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
-	       msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
-	       msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,
-	       ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+	pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+		 msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,
+		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
 	return NF_ACCEPT;
 }
 
@@ -524,7 +521,7 @@ conntrack_pptp_help(struct sk_buff **pskb, unsigned int protoff,
 
 	pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph);
 	if (!pptph) {
-		DEBUGP("no full PPTP header, can't track\n");
+		pr_debug("no full PPTP header, can't track\n");
 		return NF_ACCEPT;
 	}
 	nexthdr_off += sizeof(_pptph);
@@ -533,7 +530,7 @@ conntrack_pptp_help(struct sk_buff **pskb, unsigned int protoff,
 	/* if it's not a control message we can't do anything with it */
 	if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
 	    ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
-		DEBUGP("not a control packet\n");
+		pr_debug("not a control packet\n");
 		return NF_ACCEPT;
 	}
 
@@ -569,8 +566,8 @@ conntrack_pptp_help(struct sk_buff **pskb, unsigned int protoff,
 		/* server -> client (PAC -> PNS) */
 		ret = pptp_inbound_pkt(pskb, ctlh, pptpReq, reqlen, ct,
 				       ctinfo);
-	DEBUGP("sstate: %d->%d, cstate: %d->%d\n",
-		oldsstate, info->sstate, oldcstate, info->cstate);
+	pr_debug("sstate: %d->%d, cstate: %d->%d\n",
+		 oldsstate, info->sstate, oldcstate, info->cstate);
 	spin_unlock_bh(&nf_pptp_lock);
 
 	return ret;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 339c397d1b5f..771c4c29936e 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -40,12 +40,6 @@
 #define GRE_TIMEOUT		(30 * HZ)
 #define GRE_STREAM_TIMEOUT	(180 * HZ)
 
-#if 0
-#define DEBUGP(format, args...)	printk(KERN_DEBUG "%s:%s: " format, __FILE__, __FUNCTION__, ## args)
-#else
-#define DEBUGP(x, args...)
-#endif
-
 static DEFINE_RWLOCK(nf_ct_gre_lock);
 static LIST_HEAD(gre_keymap_list);
 
@@ -87,7 +81,7 @@ static __be16 gre_keymap_lookup(struct nf_conntrack_tuple *t)
 	}
 	read_unlock_bh(&nf_ct_gre_lock);
 
-	DEBUGP("lookup src key 0x%x for ", key);
+	pr_debug("lookup src key 0x%x for ", key);
 	NF_CT_DUMP_TUPLE(t);
 
 	return key;
@@ -107,8 +101,8 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
 			if (gre_key_cmpfn(km, t) && km == *kmp)
 				return 0;
 		}
-		DEBUGP("trying to override keymap_%s for ct %p\n",
-			dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
+		pr_debug("trying to override keymap_%s for ct %p\n",
+			 dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
 		return -EEXIST;
 	}
 
@@ -118,7 +112,7 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
 	memcpy(&km->tuple, t, sizeof(*t));
 	*kmp = km;
 
-	DEBUGP("adding new entry %p: ", km);
+	pr_debug("adding new entry %p: ", km);
 	NF_CT_DUMP_TUPLE(&km->tuple);
 
 	write_lock_bh(&nf_ct_gre_lock);
@@ -135,13 +129,13 @@ void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
 	struct nf_conn_help *help = nfct_help(ct);
 	enum ip_conntrack_dir dir;
 
-	DEBUGP("entering for ct %p\n", ct);
+	pr_debug("entering for ct %p\n", ct);
 
 	write_lock_bh(&nf_ct_gre_lock);
 	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
 		if (help->help.ct_pptp_info.keymap[dir]) {
-			DEBUGP("removing %p from list\n",
-				help->help.ct_pptp_info.keymap[dir]);
+			pr_debug("removing %p from list\n",
+				 help->help.ct_pptp_info.keymap[dir]);
 			list_del(&help->help.ct_pptp_info.keymap[dir]->list);
 			kfree(help->help.ct_pptp_info.keymap[dir]);
 			help->help.ct_pptp_info.keymap[dir] = NULL;
@@ -186,7 +180,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
 		return 1;
 
 	if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
-		DEBUGP("GRE_VERSION_PPTP but unknown proto\n");
+		pr_debug("GRE_VERSION_PPTP but unknown proto\n");
 		return 0;
 	}
 
@@ -242,7 +236,7 @@ static int gre_packet(struct nf_conn *ct,
 static int gre_new(struct nf_conn *ct, const struct sk_buff *skb,
 		   unsigned int dataoff)
 {
-	DEBUGP(": ");
+	pr_debug(": ");
 	NF_CT_DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 
 	/* initialize to sane value.  Ideally a conntrack helper
@@ -258,10 +252,10 @@ static int gre_new(struct nf_conn *ct, const struct sk_buff *skb,
 static void gre_destroy(struct nf_conn *ct)
 {
 	struct nf_conn *master = ct->master;
-	DEBUGP(" entering\n");
+	pr_debug(" entering\n");
 
 	if (!master)
-		DEBUGP("no master !?!\n");
+		pr_debug("no master !?!\n");
 	else
 		nf_ct_gre_keymap_destroy(master);
 }
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 0d3254b974c5..265769e5002b 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -25,12 +25,6 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 
-#if 0
-#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Protects conntrack->proto.sctp */
 static DEFINE_RWLOCK(sctp_lock);
 
@@ -151,9 +145,6 @@ static int sctp_pkt_to_tuple(const struct sk_buff *skb,
 {
 	sctp_sctphdr_t _hdr, *hp;
 
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
 	/* Actually only need first 8 bytes. */
 	hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
 	if (hp == NULL)
@@ -167,9 +158,6 @@ static int sctp_pkt_to_tuple(const struct sk_buff *skb,
 static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
 			     const struct nf_conntrack_tuple *orig)
 {
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
 	tuple->src.u.sctp.port = orig->dst.u.sctp.port;
 	tuple->dst.u.sctp.port = orig->src.u.sctp.port;
 	return 1;
@@ -179,9 +167,6 @@ static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
 static int sctp_print_tuple(struct seq_file *s,
 			    const struct nf_conntrack_tuple *tuple)
 {
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
 	return seq_printf(s, "sport=%hu dport=%hu ",
 			  ntohs(tuple->src.u.sctp.port),
 			  ntohs(tuple->dst.u.sctp.port));
@@ -193,9 +178,6 @@ static int sctp_print_conntrack(struct seq_file *s,
 {
 	enum sctp_conntrack state;
 
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
 	read_lock_bh(&sctp_lock);
 	state = conntrack->proto.sctp.state;
 	read_unlock_bh(&sctp_lock);
@@ -219,13 +201,10 @@ static int do_basic_checks(struct nf_conn *conntrack,
 	sctp_chunkhdr_t _sch, *sch;
 	int flag;
 
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
 	flag = 0;
 
 	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
-		DEBUGP("Chunk Num: %d  Type: %d\n", count, sch->type);
+		pr_debug("Chunk Num: %d  Type: %d\n", count, sch->type);
 
 		if (sch->type == SCTP_CID_INIT
 			|| sch->type == SCTP_CID_INIT_ACK
@@ -242,7 +221,7 @@ static int do_basic_checks(struct nf_conn *conntrack,
 			|| sch->type == SCTP_CID_COOKIE_ECHO
 			|| flag)
 		      && count !=0) || !sch->length) {
-			DEBUGP("Basic checks failed\n");
+			pr_debug("Basic checks failed\n");
 			return 1;
 		}
 
@@ -251,7 +230,7 @@ static int do_basic_checks(struct nf_conn *conntrack,
 		}
 	}
 
-	DEBUGP("Basic checks passed\n");
+	pr_debug("Basic checks passed\n");
 	return count == 0;
 }
 
@@ -261,50 +240,47 @@ static int new_state(enum ip_conntrack_dir dir,
 {
 	int i;
 
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
-	DEBUGP("Chunk type: %d\n", chunk_type);
+	pr_debug("Chunk type: %d\n", chunk_type);
 
 	switch (chunk_type) {
 		case SCTP_CID_INIT:
-			DEBUGP("SCTP_CID_INIT\n");
+			pr_debug("SCTP_CID_INIT\n");
 			i = 0; break;
 		case SCTP_CID_INIT_ACK:
-			DEBUGP("SCTP_CID_INIT_ACK\n");
+			pr_debug("SCTP_CID_INIT_ACK\n");
 			i = 1; break;
 		case SCTP_CID_ABORT:
-			DEBUGP("SCTP_CID_ABORT\n");
+			pr_debug("SCTP_CID_ABORT\n");
 			i = 2; break;
 		case SCTP_CID_SHUTDOWN:
-			DEBUGP("SCTP_CID_SHUTDOWN\n");
+			pr_debug("SCTP_CID_SHUTDOWN\n");
 			i = 3; break;
 		case SCTP_CID_SHUTDOWN_ACK:
-			DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
+			pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
 			i = 4; break;
 		case SCTP_CID_ERROR:
-			DEBUGP("SCTP_CID_ERROR\n");
+			pr_debug("SCTP_CID_ERROR\n");
 			i = 5; break;
 		case SCTP_CID_COOKIE_ECHO:
-			DEBUGP("SCTP_CID_COOKIE_ECHO\n");
+			pr_debug("SCTP_CID_COOKIE_ECHO\n");
 			i = 6; break;
 		case SCTP_CID_COOKIE_ACK:
-			DEBUGP("SCTP_CID_COOKIE_ACK\n");
+			pr_debug("SCTP_CID_COOKIE_ACK\n");
 			i = 7; break;
 		case SCTP_CID_SHUTDOWN_COMPLETE:
-			DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
+			pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
 			i = 8; break;
 		default:
 			/* Other chunks like DATA, SACK, HEARTBEAT and
 			its ACK do not cause a change in state */
-			DEBUGP("Unknown chunk type, Will stay in %s\n",
-						sctp_conntrack_names[cur_state]);
+			pr_debug("Unknown chunk type, Will stay in %s\n",
+				 sctp_conntrack_names[cur_state]);
 			return cur_state;
 	}
 
-	DEBUGP("dir: %d   cur_state: %s  chunk_type: %d  new_state: %s\n",
-			dir, sctp_conntrack_names[cur_state], chunk_type,
-			sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
+	pr_debug("dir: %d   cur_state: %s  chunk_type: %d  new_state: %s\n",
+		 dir, sctp_conntrack_names[cur_state], chunk_type,
+		 sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
 
 	return sctp_conntracks[dir][i][cur_state];
 }
@@ -323,9 +299,6 @@ static int sctp_packet(struct nf_conn *conntrack,
 	u_int32_t offset, count;
 	char map[256 / sizeof (char)] = {0};
 
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
 	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
 	if (sh == NULL)
 		return -1;
@@ -340,7 +313,7 @@ static int sctp_packet(struct nf_conn *conntrack,
 		&& !test_bit(SCTP_CID_ABORT, (void *)map)
 		&& !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
 		&& (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
-		DEBUGP("Verification tag check failed\n");
+		pr_debug("Verification tag check failed\n");
 		return -1;
 	}
 
@@ -385,8 +358,9 @@ static int sctp_packet(struct nf_conn *conntrack,
 
 		/* Invalid */
 		if (newconntrack == SCTP_CONNTRACK_MAX) {
-			DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
-			       CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
+			pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
+				 "conntrack=%u\n",
+				 CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
 			write_unlock_bh(&sctp_lock);
 			return -1;
 		}
@@ -402,8 +376,8 @@ static int sctp_packet(struct nf_conn *conntrack,
 					write_unlock_bh(&sctp_lock);
 					return -1;
 			}
-			DEBUGP("Setting vtag %x for dir %d\n",
-					ih->init_tag, !CTINFO2DIR(ctinfo));
+			pr_debug("Setting vtag %x for dir %d\n",
+				 ih->init_tag, !CTINFO2DIR(ctinfo));
 			conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
 		}
 
@@ -418,7 +392,7 @@ static int sctp_packet(struct nf_conn *conntrack,
 	if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
 		&& CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
 		&& newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
-		DEBUGP("Setting assured bit\n");
+		pr_debug("Setting assured bit\n");
 		set_bit(IPS_ASSURED_BIT, &conntrack->status);
 		nf_conntrack_event_cache(IPCT_STATUS, skb);
 	}
@@ -436,9 +410,6 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
 	u_int32_t offset, count;
 	char map[256 / sizeof (char)] = {0};
 
-	DEBUGP(__FUNCTION__);
-	DEBUGP("\n");
-
 	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
 	if (sh == NULL)
 		return 0;
@@ -461,7 +432,7 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
 
 		/* Invalid: delete conntrack */
 		if (newconntrack == SCTP_CONNTRACK_MAX) {
-			DEBUGP("nf_conntrack_sctp: invalid new deleting.\n");
+			pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
 			return 0;
 		}
 
@@ -475,8 +446,8 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
 				if (ih == NULL)
 					return 0;
 
-				DEBUGP("Setting vtag %x for new conn\n",
-					ih->init_tag);
+				pr_debug("Setting vtag %x for new conn\n",
+					 ih->init_tag);
 
 				conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
 								ih->init_tag;
@@ -488,8 +459,8 @@ static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
 		/* If it is a shutdown ack OOTB packet, we expect a return
 		   shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
 		else {
-			DEBUGP("Setting vtag %x for new conn OOTB\n",
-				sh->vtag);
+			pr_debug("Setting vtag %x for new conn OOTB\n",
+				 sh->vtag);
 			conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
 		}
 
@@ -688,8 +659,6 @@ int __init nf_conntrack_proto_sctp_init(void)
  cleanup_sctp4:
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
  out:
-	DEBUGP("SCTP conntrack module loading %s\n",
-					ret ? "failed": "succeeded");
 	return ret;
 }
 
@@ -697,7 +666,6 @@ void __exit nf_conntrack_proto_sctp_fini(void)
 {
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
-	DEBUGP("SCTP conntrack module unloaded\n");
 }
 
 module_init(nf_conntrack_proto_sctp_init);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index ccdd5d231e0d..1c8206e6560a 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -26,13 +26,6 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 
-#if 0
-#define DEBUGP printk
-#define DEBUGP_VARS
-#else
-#define DEBUGP(format, args...)
-#endif
-
 /* Protects conntrack->proto.tcp */
 static DEFINE_RWLOCK(tcp_lock);
 
@@ -496,7 +489,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 	}
 }
 
-static int tcp_in_window(struct ip_ct_tcp *state,
+static int tcp_in_window(struct nf_conn *ct,
+			 struct ip_ct_tcp *state,
 			 enum ip_conntrack_dir dir,
 			 unsigned int index,
 			 const struct sk_buff *skb,
@@ -506,6 +500,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 {
 	struct ip_ct_tcp_state *sender = &state->seen[dir];
 	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+	struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
 	__u32 seq, ack, sack, end, win, swin;
 	int res;
 
@@ -520,18 +515,17 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 	if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
 		tcp_sack(skb, dataoff, tcph, &sack);
 
-	DEBUGP("tcp_in_window: START\n");
-	DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
-	       "seq=%u ack=%u sack=%u win=%u end=%u\n",
-		NIPQUAD(iph->saddr), ntohs(tcph->source),
-		NIPQUAD(iph->daddr), ntohs(tcph->dest),
-		seq, ack, sack, win, end);
-	DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
-	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
-		sender->td_end, sender->td_maxend, sender->td_maxwin,
-		sender->td_scale,
-		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
-		receiver->td_scale);
+	pr_debug("tcp_in_window: START\n");
+	pr_debug("tcp_in_window: ");
+	NF_CT_DUMP_TUPLE(tuple);
+	pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n",
+		 seq, ack, sack, win, end);
+	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
 
 	if (sender->td_end == 0) {
 		/*
@@ -609,23 +603,22 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 		 */
 		seq = end = sender->td_end;
 
-	DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
-	       "seq=%u ack=%u sack =%u win=%u end=%u\n",
-		NIPQUAD(iph->saddr), ntohs(tcph->source),
-		NIPQUAD(iph->daddr), ntohs(tcph->dest),
-		seq, ack, sack, win, end);
-	DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
-	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
-		sender->td_end, sender->td_maxend, sender->td_maxwin,
-		sender->td_scale,
-		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
-		receiver->td_scale);
-
-	DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
-		before(seq, sender->td_maxend + 1),
-		after(end, sender->td_end - receiver->td_maxwin - 1),
-		before(sack, receiver->td_end + 1),
-		after(ack, receiver->td_end - MAXACKWINDOW(sender)));
+	pr_debug("tcp_in_window: ");
+	NF_CT_DUMP_TUPLE(tuple);
+	pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n",
+		 seq, ack, sack, win, end);
+	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
+
+	pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+		 before(seq, sender->td_maxend + 1),
+		 after(end, sender->td_end - receiver->td_maxwin - 1),
+		 before(sack, receiver->td_end + 1),
+		 after(ack, receiver->td_end - MAXACKWINDOW(sender)));
 
 	if (before(seq, sender->td_maxend + 1) &&
 	    after(end, sender->td_end - receiver->td_maxwin - 1) &&
@@ -694,10 +687,10 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 			: "SEQ is over the upper bound (over the window of the receiver)");
 	}
 
-	DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
-	       "receiver end=%u maxend=%u maxwin=%u\n",
-		res, sender->td_end, sender->td_maxend, sender->td_maxwin,
-		receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+	pr_debug("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
+		 "receiver end=%u maxend=%u maxwin=%u\n",
+		 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
 
 	return res;
 }
@@ -711,11 +704,9 @@ void nf_conntrack_tcp_update(struct sk_buff *skb,
 			     int dir)
 {
 	struct tcphdr *tcph = (void *)skb->data + dataoff;
-	__u32 end;
-#ifdef DEBUGP_VARS
 	struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
 	struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
-#endif
+	__u32 end;
 
 	end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
 
@@ -727,12 +718,12 @@ void nf_conntrack_tcp_update(struct sk_buff *skb,
 		conntrack->proto.tcp.seen[dir].td_end = end;
 	conntrack->proto.tcp.last_end = end;
 	write_unlock_bh(&tcp_lock);
-	DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
-	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
-		sender->td_end, sender->td_maxend, sender->td_maxwin,
-		sender->td_scale,
-		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
-		receiver->td_scale);
+	pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update);
 #endif
@@ -823,6 +814,7 @@ static int tcp_packet(struct nf_conn *conntrack,
 		      int pf,
 		      unsigned int hooknum)
 {
+	struct nf_conntrack_tuple *tuple;
 	enum tcp_conntrack new_state, old_state;
 	enum ip_conntrack_dir dir;
 	struct tcphdr *th, _tcph;
@@ -837,6 +829,7 @@ static int tcp_packet(struct nf_conn *conntrack,
 	dir = CTINFO2DIR(ctinfo);
 	index = get_conntrack_index(th);
 	new_state = tcp_conntracks[dir][index][old_state];
+	tuple = &conntrack->tuplehash[dir].tuple;
 
 	switch (new_state) {
 	case TCP_CONNTRACK_IGNORE:
@@ -880,9 +873,8 @@ static int tcp_packet(struct nf_conn *conntrack,
 		return NF_ACCEPT;
 	case TCP_CONNTRACK_MAX:
 		/* Invalid packet */
-		DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
-		       dir, get_conntrack_index(th),
-		       old_state);
+		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
+			 dir, get_conntrack_index(th), old_state);
 		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
@@ -933,7 +925,7 @@ static int tcp_packet(struct nf_conn *conntrack,
 		break;
 	}
 
-	if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
+	if (!tcp_in_window(conntrack, &conntrack->proto.tcp, dir, index,
 			   skb, dataoff, th, pf)) {
 		write_unlock_bh(&tcp_lock);
 		return -NF_ACCEPT;
@@ -942,13 +934,12 @@ static int tcp_packet(struct nf_conn *conntrack,
 	/* From now on we have got in-window packets */
 	conntrack->proto.tcp.last_index = index;
 
-	DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
-	       "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
-		NIPQUAD(iph->saddr), ntohs(th->source),
-		NIPQUAD(iph->daddr), ntohs(th->dest),
-		(th->syn ? 1 : 0), (th->ack ? 1 : 0),
-		(th->fin ? 1 : 0), (th->rst ? 1 : 0),
-		old_state, new_state);
+	pr_debug("tcp_conntracks: ");
+	NF_CT_DUMP_TUPLE(tuple);
+	pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+		 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
+		 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
+		 old_state, new_state);
 
 	conntrack->proto.tcp.state = new_state;
 	if (old_state != new_state
@@ -997,10 +988,8 @@ static int tcp_new(struct nf_conn *conntrack,
 {
 	enum tcp_conntrack new_state;
 	struct tcphdr *th, _tcph;
-#ifdef DEBUGP_VARS
 	struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
 	struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
-#endif
 
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	BUG_ON(th == NULL);
@@ -1012,7 +1001,7 @@ static int tcp_new(struct nf_conn *conntrack,
 
 	/* Invalid: delete conntrack */
 	if (new_state >= TCP_CONNTRACK_MAX) {
-		DEBUGP("nf_ct_tcp: invalid new deleting.\n");
+		pr_debug("nf_ct_tcp: invalid new deleting.\n");
 		return 0;
 	}
 
@@ -1065,12 +1054,12 @@ static int tcp_new(struct nf_conn *conntrack,
 	conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
 	conntrack->proto.tcp.last_index = TCP_NONE_SET;
 
-	DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
-	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
-		sender->td_end, sender->td_maxend, sender->td_maxwin,
-		sender->td_scale,
-		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
-		receiver->td_scale);
+	pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
 	return 1;
 }
 
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 627eda79d154..355d371bac93 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -40,12 +40,6 @@ static u_int16_t ports[MAX_PORTS];
 static unsigned int ports_c;
 module_param_array(ports, ushort, &ports_c, 0400);
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 struct sane_request {
 	__be32 RPC_code;
 #define SANE_NET_START      7   /* RPC code */
@@ -125,15 +119,15 @@ static int help(struct sk_buff **pskb,
 	ct_sane_info->state = SANE_STATE_NORMAL;
 
 	if (datalen < sizeof(struct sane_reply_net_start)) {
-		DEBUGP("nf_ct_sane: NET_START reply too short\n");
+		pr_debug("nf_ct_sane: NET_START reply too short\n");
 		goto out;
 	}
 
 	reply = (struct sane_reply_net_start *)sb_ptr;
 	if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
 		/* saned refused the command */
-		DEBUGP("nf_ct_sane: unsuccessful SANE_STATUS = %u\n",
-			ntohl(reply->status));
+		pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n",
+			 ntohl(reply->status));
 		goto out;
 	}
 
@@ -151,9 +145,8 @@ static int help(struct sk_buff **pskb,
 	nf_ct_expect_init(exp, family, &tuple->src.u3, &tuple->dst.u3,
 			  IPPROTO_TCP, NULL, &reply->port);
 
-	DEBUGP("nf_ct_sane: expect: ");
+	pr_debug("nf_ct_sane: expect: ");
 	NF_CT_DUMP_TUPLE(&exp->tuple);
-	NF_CT_DUMP_TUPLE(&exp->mask);
 
 	/* Can't expect this?  Best to drop packet now. */
 	if (nf_ct_expect_related(exp) != 0)
@@ -176,9 +169,9 @@ static void nf_conntrack_sane_fini(void)
 
 	for (i = 0; i < ports_c; i++) {
 		for (j = 0; j < 2; j++) {
-			DEBUGP("nf_ct_sane: unregistering helper for pf: %d "
-			       "port: %d\n",
-				sane[i][j].tuple.src.l3num, ports[i]);
+			pr_debug("nf_ct_sane: unregistering helper for pf: %d "
+				 "port: %d\n",
+				 sane[i][j].tuple.src.l3num, ports[i]);
 			nf_conntrack_helper_unregister(&sane[i][j]);
 		}
 	}
@@ -217,9 +210,9 @@ static int __init nf_conntrack_sane_init(void)
 				sprintf(tmpname, "sane-%d", ports[i]);
 			sane[i][j].name = tmpname;
 
-			DEBUGP("nf_ct_sane: registering helper for pf: %d "
-			       "port: %d\n",
-				sane[i][j].tuple.src.l3num, ports[i]);
+			pr_debug("nf_ct_sane: registering helper for pf: %d "
+				 "port: %d\n",
+				 sane[i][j].tuple.src.l3num, ports[i]);
 			ret = nf_conntrack_helper_register(&sane[i][j]);
 			if (ret) {
 				printk(KERN_ERR "nf_ct_sane: failed to "
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 5b78f0e1f63b..1276a442f10c 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -21,12 +21,6 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <linux/netfilter/nf_conntrack_sip.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
 MODULE_DESCRIPTION("SIP connection tracking helper");
@@ -285,7 +279,7 @@ static int epaddr_len(struct nf_conn *ct, const char *dptr,
 	const char *aux = dptr;
 
 	if (!parse_addr(ct, dptr, &dptr, &addr, limit)) {
-		DEBUGP("ip: %s parse failed.!\n", dptr);
+		pr_debug("ip: %s parse failed.!\n", dptr);
 		return 0;
 	}
 
@@ -344,8 +338,8 @@ int ct_sip_get_info(struct nf_conn *ct,
 				    ct_sip_lnlen(dptr, limit),
 				    hnfo->case_sensitive);
 		if (!aux) {
-			DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str,
-			       hnfo->lname);
+			pr_debug("'%s' not found in '%s'.\n", hnfo->ln_str,
+				 hnfo->lname);
 			return -1;
 		}
 		aux += hnfo->ln_strlen;
@@ -356,11 +350,11 @@ int ct_sip_get_info(struct nf_conn *ct,
 
 		*matchoff = (aux - k) + shift;
 
-		DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname,
-		       *matchlen);
+		pr_debug("%s match succeeded! - len: %u\n", hnfo->lname,
+			 *matchlen);
 		return 1;
 	}
-	DEBUGP("%s header not found.\n", hnfo->lname);
+	pr_debug("%s header not found.\n", hnfo->lname);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ct_sip_get_info);
@@ -424,7 +418,7 @@ static int sip_help(struct sk_buff **pskb,
 	if (!skb_is_nonlinear(*pskb))
 		dptr = (*pskb)->data + dataoff;
 	else {
-		DEBUGP("Copy of skbuff not supported yet.\n");
+		pr_debug("Copy of skbuff not supported yet.\n");
 		goto out;
 	}
 
@@ -518,7 +512,7 @@ static int __init nf_conntrack_sip_init(void)
 				sprintf(tmpname, "sip-%u", i);
 			sip[i][j].name = tmpname;
 
-			DEBUGP("port #%u: %u\n", i, ports[i]);
+			pr_debug("port #%u: %u\n", i, ports[i]);
 
 			ret = nf_conntrack_helper_register(&sip[i][j]);
 			if (ret) {
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 6af96c6e29fb..54498bcfa862 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -25,12 +25,6 @@
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 MODULE_LICENSE("GPL");
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index db0387cf9bac..cc19506cf2f8 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -29,13 +29,6 @@ static int ports_c;
 module_param_array(ports, ushort, &ports_c, 0400);
 MODULE_PARM_DESC(ports, "Port numbers of TFTP servers");
 
-#if 0
-#define DEBUGP(format, args...) printk("%s:%s:" format, \
-				       __FILE__, __FUNCTION__ , ## args)
-#else
-#define DEBUGP(format, args...)
-#endif
-
 unsigned int (*nf_nat_tftp_hook)(struct sk_buff **pskb,
 				 enum ip_conntrack_info ctinfo,
 				 struct nf_conntrack_expect *exp) __read_mostly;
@@ -62,7 +55,6 @@ static int tftp_help(struct sk_buff **pskb,
 	case TFTP_OPCODE_READ:
 	case TFTP_OPCODE_WRITE:
 		/* RRQ and WRQ works the same way */
-		DEBUGP("");
 		NF_CT_DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 		NF_CT_DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
@@ -73,9 +65,8 @@ static int tftp_help(struct sk_buff **pskb,
 		nf_ct_expect_init(exp, family, &tuple->src.u3, &tuple->dst.u3,
 				  IPPROTO_UDP, NULL, &tuple->dst.u.udp.port);
 
-		DEBUGP("expect: ");
+		pr_debug("expect: ");
 		NF_CT_DUMP_TUPLE(&exp->tuple);
-		NF_CT_DUMP_TUPLE(&exp->mask);
 
 		nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);
 		if (nf_nat_tftp && ct->status & IPS_NAT_MASK)
@@ -86,13 +77,13 @@ static int tftp_help(struct sk_buff **pskb,
 		break;
 	case TFTP_OPCODE_DATA:
 	case TFTP_OPCODE_ACK:
-		DEBUGP("Data/ACK opcode\n");
+		pr_debug("Data/ACK opcode\n");
 		break;
 	case TFTP_OPCODE_ERROR:
-		DEBUGP("Error opcode\n");
+		pr_debug("Error opcode\n");
 		break;
 	default:
-		DEBUGP("Unknown opcode\n");
+		pr_debug("Unknown opcode\n");
 	}
 	return ret;
 }
-- 
cgit v1.3-8-gc7d7


From ce7663d84a87bb4e1743f62950bf7dceed723a13 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Sat, 7 Jul 2007 22:40:08 -0700
Subject: [NETFILTER]: nfnetlink_queue: don't unregister handler of other
 subsystem

The queue handlers registered by ip[6]_queue.ko at initialization should
not be unregistered according to requests from userland program
using nfnetlink_queue. If we allow that, there is no way to register
the handlers of built-in ip[6]_queue again.

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h       | 3 ++-
 net/netfilter/nf_queue.c        | 7 ++++++-
 net/netfilter/nfnetlink_queue.c | 4 +---
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 10b5c6275706..0eed0b7ab2df 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -275,7 +275,8 @@ struct nf_queue_handler {
 };
 extern int nf_register_queue_handler(int pf, 
                                      struct nf_queue_handler *qh);
-extern int nf_unregister_queue_handler(int pf);
+extern int nf_unregister_queue_handler(int pf,
+				       struct nf_queue_handler *qh);
 extern void nf_unregister_queue_handlers(struct nf_queue_handler *qh);
 extern void nf_reinject(struct sk_buff *skb,
 			struct nf_info *info,
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index b1f2ace96f6d..f402894d7e72 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -44,12 +44,17 @@ int nf_register_queue_handler(int pf, struct nf_queue_handler *qh)
 EXPORT_SYMBOL(nf_register_queue_handler);
 
 /* The caller must flush their queue before this */
-int nf_unregister_queue_handler(int pf)
+int nf_unregister_queue_handler(int pf, struct nf_queue_handler *qh)
 {
 	if (pf >= NPROTO)
 		return -EINVAL;
 
 	write_lock_bh(&queue_handler_lock);
+	if (queue_handler[pf] != qh) {
+		write_unlock_bh(&queue_handler_lock);
+		return -EINVAL;
+	}
+
 	queue_handler[pf] = NULL;
 	write_unlock_bh(&queue_handler_lock);
 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 7a97bec67729..7d47fc4b19c6 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -913,9 +913,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		case NFQNL_CFG_CMD_PF_UNBIND:
 			QDEBUG("unregistering queue handler for pf=%u\n",
 				ntohs(cmd->pf));
-			/* This is a bug and a feature.  We can unregister
-			 * other handlers(!) */
-			ret = nf_unregister_queue_handler(ntohs(cmd->pf));
+			ret = nf_unregister_queue_handler(ntohs(cmd->pf), &nfqh);
 			break;
 		default:
 			ret = -EINVAL;
-- 
cgit v1.3-8-gc7d7


From c6c6e3e05c0b4349824efcdd36650e7be9d5c7c3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 10 Jul 2007 22:41:55 -0700
Subject: [NET]: Update comments for skb checksums

Rusty (whose comments we should all study and emulate :) pointed
out that our comments for skb checksums are no longer up-to-date.
So here is a patch to

1) add the case of partial checksums on input;
2) update partial checksum case to mention csum_start/csum_offset;
3) mention the new IPv6 feature bit.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 625d73b07ab7..9391e4a4c344 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -65,13 +65,20 @@
  *	    is able to produce some skb->csum, it MUST use COMPLETE,
  *	    not UNNECESSARY.
  *
+ *	PARTIAL: identical to the case for output below.  This may occur
+ *	    on a packet received directly from another Linux OS, e.g.,
+ *	    a virtualised Linux kernel on the same host.  The packet can
+ *	    be treated in the same way as UNNECESSARY except that on
+ *	    output (i.e., forwarding) the checksum must be filled in
+ *	    by the OS or the hardware.
+ *
  * B. Checksumming on output.
  *
  *	NONE: skb is checksummed by protocol or csum is not required.
  *
  *	PARTIAL: device is required to csum packet as seen by hard_start_xmit
- *	from skb->transport_header to the end and to record the checksum
- *	at skb->transport_header + skb->csum.
+ *	from skb->csum_start to the end and to record the checksum
+ *	at skb->csum_start + skb->csum_offset.
  *
  *	Device must show its capabilities in dev->features, set
  *	at device setup time.
@@ -82,6 +89,7 @@
  *			  TCP/UDP over IPv4. Sigh. Vendors like this
  *			  way by an unknown reason. Though, see comment above
  *			  about CHECKSUM_UNNECESSARY. 8)
+ *	NETIF_F_IPV6_CSUM about as dumb as the last one but does IPv6 instead.
  *
  *	Any questions? No questions, good. 		--ANK
  */
-- 
cgit v1.3-8-gc7d7


From bb4dbf9e61d0801927e7df2569bb3dd8287ea301 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 10 Jul 2007 22:55:49 -0700
Subject: [IPV6]: Do not send RH0 anymore.

Based on <draft-ietf-ipv6-deprecate-rh0-00.txt>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  3 +-
 include/linux/ipv6.h                   |  4 +-
 include/net/ipv6.h                     |  4 --
 net/dccp/ipv6.c                        | 20 ---------
 net/ipv6/datagram.c                    |  3 +-
 net/ipv6/exthdrs.c                     | 78 ----------------------------------
 net/ipv6/ipv6_sockglue.c               |  3 +-
 net/ipv6/tcp_ipv6.c                    | 20 ---------
 8 files changed, 5 insertions(+), 130 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index af6a63ab9026..09c184e41cf8 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -874,8 +874,7 @@ accept_redirects - BOOLEAN
 accept_source_route - INTEGER
 	Accept source routing (routing extension header).
 
-	> 0: Accept routing header.
-	= 0: Accept only routing header type 2.
+	>= 0: Accept only routing header type 2.
 	< 0: Do not accept routing header.
 
 	Default: 0
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 213b63be3c8f..cb3118cf277c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -27,8 +27,8 @@ struct in6_ifreq {
 	int		ifr6_ifindex; 
 };
 
-#define IPV6_SRCRT_STRICT	0x01	/* this hop must be a neighbor	*/
-#define IPV6_SRCRT_TYPE_0	0	/* IPv6 type 0 Routing Header	*/
+#define IPV6_SRCRT_STRICT	0x01	/* Deprecated; will be removed */
+#define IPV6_SRCRT_TYPE_0	0	/* Deprecated; will be removed */
 #define IPV6_SRCRT_TYPE_2	2	/* IPv6 type 2 Routing Header	*/
 
 /*
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 78a0d06d98d5..46b9dce82f6e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -512,10 +512,6 @@ extern int 			ipv6_ext_hdr(u8 nexthdr);
 
 extern int ipv6_find_tlv(struct sk_buff *skb, int offset, int type);
 
-extern struct ipv6_txoptions *	ipv6_invert_rthdr(struct sock *sk,
-						  struct ipv6_rt_hdr *hdr);
-
-
 /*
  *	socket options (ipv6_sockglue.c)
  */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 31737cdf156a..b158c661867b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -253,17 +253,6 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
 
 	if (dst == NULL) {
 		opt = np->opt;
-		if (opt == NULL &&
-		    np->rxopt.bits.osrcrt == 2 &&
-		    ireq6->pktopts) {
-			struct sk_buff *pktopts = ireq6->pktopts;
-			struct inet6_skb_parm *rxopt = IP6CB(pktopts);
-
-			if (rxopt->srcrt)
-				opt = ipv6_invert_rthdr(sk,
-			  (struct ipv6_rt_hdr *)(skb_network_header(pktopts) +
-						 rxopt->srcrt));
-		}
 
 		if (opt != NULL && opt->srcrt != NULL) {
 			const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
@@ -570,15 +559,6 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 	if (sk_acceptq_is_full(sk))
 		goto out_overflow;
 
-	if (np->rxopt.bits.osrcrt == 2 && opt == NULL && ireq6->pktopts) {
-		const struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
-
-		if (rxopt->srcrt)
-			opt = ipv6_invert_rthdr(sk,
-		   (struct ipv6_rt_hdr *)(skb_network_header(ireq6->pktopts) +
-					  rxopt->srcrt));
-	}
-
 	if (dst == NULL) {
 		struct in6_addr *final_p = NULL, final;
 		struct flowi fl;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ba1386dd4168..fe0f49024a0a 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -657,11 +657,10 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 			rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg);
 
 			switch (rthdr->type) {
-			case IPV6_SRCRT_TYPE_0:
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 			case IPV6_SRCRT_TYPE_2:
-#endif
 				break;
+#endif
 			default:
 				err = -EINVAL;
 				goto exit_f;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index fc3a961fc5ba..c82d4d49f71f 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -427,18 +427,6 @@ looped_back:
 	}
 
 	switch (hdr->type) {
-	case IPV6_SRCRT_TYPE_0:
-		if (accept_source_route <= 0)
-			goto unknown_rh;
-		if (hdr->hdrlen & 0x01) {
-			IP6_INC_STATS_BH(ip6_dst_idev(skb->dst),
-					 IPSTATS_MIB_INHDRERRORS);
-			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
-					  ((&hdr->hdrlen) -
-					   skb_network_header(skb)));
-			return -1;
-		}
-		break;
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	case IPV6_SRCRT_TYPE_2:
 		if (accept_source_route < 0)
@@ -576,72 +564,6 @@ void __init ipv6_rthdr_init(void)
 		printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n");
 };
 
-/*
-   This function inverts received rthdr.
-   NOTE: specs allow to make it automatically only if
-   packet authenticated.
-
-   I will not discuss it here (though, I am really pissed off at
-   this stupid requirement making rthdr idea useless)
-
-   Actually, it creates severe problems  for us.
-   Embryonic requests has no associated sockets,
-   so that user have no control over it and
-   cannot not only to set reply options, but
-   even to know, that someone wants to connect
-   without success. :-(
-
-   For now we need to test the engine, so that I created
-   temporary (or permanent) backdoor.
-   If listening socket set IPV6_RTHDR to 2, then we invert header.
-						   --ANK (980729)
- */
-
-struct ipv6_txoptions *
-ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
-{
-	/* Received rthdr:
-
-	   [ H1 -> H2 -> ... H_prev ]  daddr=ME
-
-	   Inverted result:
-	   [ H_prev -> ... -> H1 ] daddr =sender
-
-	   Note, that IP output engine will rewrite this rthdr
-	   by rotating it left by one addr.
-	 */
-
-	int n, i;
-	struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr;
-	struct rt0_hdr *irthdr;
-	struct ipv6_txoptions *opt;
-	int hdrlen = ipv6_optlen(hdr);
-
-	if (hdr->segments_left ||
-	    hdr->type != IPV6_SRCRT_TYPE_0 ||
-	    hdr->hdrlen & 0x01)
-		return NULL;
-
-	n = hdr->hdrlen >> 1;
-	opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC);
-	if (opt == NULL)
-		return NULL;
-	memset(opt, 0, sizeof(*opt));
-	opt->tot_len = sizeof(*opt) + hdrlen;
-	opt->srcrt = (void*)(opt+1);
-	opt->opt_nflen = hdrlen;
-
-	memcpy(opt->srcrt, hdr, sizeof(*hdr));
-	irthdr = (struct rt0_hdr*)opt->srcrt;
-	irthdr->reserved = 0;
-	opt->srcrt->segments_left = n;
-	for (i=0; i<n; i++)
-		memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16);
-	return opt;
-}
-
-EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
-
 /**********************************
   Hop-by-hop options.
  **********************************/
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 1c3506696cb5..1841714ac419 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -416,11 +416,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		if (optname == IPV6_RTHDR && opt && opt->srcrt) {
 			struct ipv6_rt_hdr *rthdr = opt->srcrt;
 			switch (rthdr->type) {
-			case IPV6_SRCRT_TYPE_0:
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 			case IPV6_SRCRT_TYPE_2:
-#endif
 				break;
+#endif
 			default:
 				goto sticky_done;
 			}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 193d9d60bb7a..d67fb1ef751e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -484,17 +484,6 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 
 	if (dst == NULL) {
 		opt = np->opt;
-		if (opt == NULL &&
-		    np->rxopt.bits.osrcrt == 2 &&
-		    treq->pktopts) {
-			struct sk_buff *pktopts = treq->pktopts;
-			struct inet6_skb_parm *rxopt = IP6CB(pktopts);
-			if (rxopt->srcrt)
-				opt = ipv6_invert_rthdr(sk,
-			  (struct ipv6_rt_hdr *)(skb_network_header(pktopts) +
-						 rxopt->srcrt));
-		}
-
 		if (opt && opt->srcrt) {
 			struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
 			ipv6_addr_copy(&final, &fl.fl6_dst);
@@ -1391,15 +1380,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	if (sk_acceptq_is_full(sk))
 		goto out_overflow;
 
-	if (np->rxopt.bits.osrcrt == 2 &&
-	    opt == NULL && treq->pktopts) {
-		struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts);
-		if (rxopt->srcrt)
-			opt = ipv6_invert_rthdr(sk,
-		   (struct ipv6_rt_hdr *)(skb_network_header(treq->pktopts) +
-					  rxopt->srcrt));
-	}
-
 	if (dst == NULL) {
 		struct in6_addr *final_p = NULL, final;
 		struct flowi fl;
-- 
cgit v1.3-8-gc7d7


From 4c752098f529f41abfc985426a3eca0f2cb96676 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 23 May 2007 13:28:48 +0900
Subject: [IPV6]: Make IPV6_{RECV,2292}RTHDR boolean options.

Because reversing RH0 is no longer supported by deprecation
of RH0, let's make IPV6_{RECV,2292}RTHDR boolean options.
Boolean are more appropriate from standard POV.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     | 4 ++--
 net/ipv6/ipv6_sockglue.c | 8 ++------
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index cb3118cf277c..97983dc9df13 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -299,8 +299,8 @@ struct ipv6_pinfo {
 	/* pktoption flags */
 	union {
 		struct {
-			__u16	srcrt:2,
-				osrcrt:2,
+			__u16	srcrt:1,
+				osrcrt:1,
 			        rxinfo:1,
 			        rxoinfo:1,
 				rxhlim:1,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 1841714ac419..d6846393182d 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -336,16 +336,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_RECVRTHDR:
-		if (val < 0 || val > 2)
-			goto e_inval;
-		np->rxopt.bits.srcrt = val;
+		np->rxopt.bits.srcrt = valbool;
 		retv = 0;
 		break;
 
 	case IPV6_2292RTHDR:
-		if (val < 0 || val > 2)
-			goto e_inval;
-		np->rxopt.bits.osrcrt = val;
+		np->rxopt.bits.osrcrt = valbool;
 		retv = 0;
 		break;
 
-- 
cgit v1.3-8-gc7d7


From e69ff734e15eb7f61621f8764ce0a2181823a737 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 8 Jun 2007 16:26:08 +1000
Subject: [CRYPTO] cipher: Remove obsolete fields from cipher_tfm

This removes all the unused block cipher fields from cipher_tfm.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 0de7e2ace822..357e8cfedc37 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -295,28 +295,8 @@ struct blkcipher_tfm {
 };
 
 struct cipher_tfm {
-	void *cit_iv;
-	unsigned int cit_ivsize;
-	u32 cit_mode;
 	int (*cit_setkey)(struct crypto_tfm *tfm,
 	                  const u8 *key, unsigned int keylen);
-	int (*cit_encrypt)(struct crypto_tfm *tfm,
-			   struct scatterlist *dst,
-			   struct scatterlist *src,
-			   unsigned int nbytes);
-	int (*cit_encrypt_iv)(struct crypto_tfm *tfm,
-	                      struct scatterlist *dst,
-	                      struct scatterlist *src,
-	                      unsigned int nbytes, u8 *iv);
-	int (*cit_decrypt)(struct crypto_tfm *tfm,
-			   struct scatterlist *dst,
-			   struct scatterlist *src,
-			   unsigned int nbytes);
-	int (*cit_decrypt_iv)(struct crypto_tfm *tfm,
-			   struct scatterlist *dst,
-			   struct scatterlist *src,
-			   unsigned int nbytes, u8 *iv);
-	void (*cit_xor_block)(u8 *dst, const u8 *src);
 	void (*cit_encrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 	void (*cit_decrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 };
-- 
cgit v1.3-8-gc7d7


From d556ad4bbe75faf17b239e151a9f003322b2e851 Mon Sep 17 00:00:00 2001
From: Peter Oruba <peter.oruba@amd.com>
Date: Tue, 15 May 2007 13:59:13 +0200
Subject: PCI: add PCI-X/PCI-Express read control interfaces

This patch introduces an interface to read and write PCI-X / PCI-Express
maximum read byte count values from PCI config space. There is a second
function that returns the maximum _designed_ read byte count, which marks the
maximum value for a device, since some drivers try to set MMRBC to the
highest allowed value and rely on such a function.

Based on patch set by Stephen Hemminger <shemminger@linux-foundation.org>

Cc: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: Peter Oruba <peter.oruba@amd.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/pci.c    | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/quirks.c |  16 ++++++
 include/linux/pci.h  |   7 ++-
 3 files changed, 182 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index fd47ac0c4730..1bb879959a20 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1374,6 +1374,166 @@ pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask)
 }
 #endif
 
+/**
+ * pcix_get_max_mmrbc - get PCI-X maximum designed memory read byte count
+ * @dev: PCI device to query
+ *
+ * Returns mmrbc: maximum designed memory read count in bytes
+ *    or appropriate error value.
+ */
+int pcix_get_max_mmrbc(struct pci_dev *dev)
+{
+	int ret, err, cap;
+	u32 stat;
+
+	cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+	if (!cap)
+		return -EINVAL;
+
+	err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat);
+	if (err)
+		return -EINVAL;
+
+	ret = (stat & PCI_X_STATUS_MAX_READ) >> 12;
+
+	return ret;
+}
+EXPORT_SYMBOL(pcix_get_max_mmrbc);
+
+/**
+ * pcix_get_mmrbc - get PCI-X maximum memory read byte count
+ * @dev: PCI device to query
+ *
+ * Returns mmrbc: maximum memory read count in bytes
+ *    or appropriate error value.
+ */
+int pcix_get_mmrbc(struct pci_dev *dev)
+{
+	int ret, cap;
+	u32 cmd;
+
+	cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+	if (!cap)
+		return -EINVAL;
+
+	ret = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd);
+	if (!ret)
+		ret = 512 << ((cmd & PCI_X_CMD_MAX_READ) >> 2);
+
+	return ret;
+}
+EXPORT_SYMBOL(pcix_get_mmrbc);
+
+/**
+ * pcix_set_mmrbc - set PCI-X maximum memory read byte count
+ * @dev: PCI device to query
+ * @mmrbc: maximum memory read count in bytes
+ *    valid values are 512, 1024, 2048, 4096
+ *
+ * If possible sets maximum memory read byte count, some bridges have erratas
+ * that prevent this.
+ */
+int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc)
+{
+	int cap, err = -EINVAL;
+	u32 stat, cmd, v, o;
+
+	if (mmrbc < 512 || mmrbc > 4096 || (mmrbc & (mmrbc-1)))
+		goto out;
+
+	v = ffs(mmrbc) - 10;
+
+	cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+	if (!cap)
+		goto out;
+
+	err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat);
+	if (err)
+		goto out;
+
+	if (v > (stat & PCI_X_STATUS_MAX_READ) >> 21)
+		return -E2BIG;
+
+	err = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd);
+	if (err)
+		goto out;
+
+	o = (cmd & PCI_X_CMD_MAX_READ) >> 2;
+	if (o != v) {
+		if (v > o && dev->bus &&
+		   (dev->bus->bus_flags & PCI_BUS_FLAGS_NO_MMRBC))
+			return -EIO;
+
+		cmd &= ~PCI_X_CMD_MAX_READ;
+		cmd |= v << 2;
+		err = pci_write_config_dword(dev, cap + PCI_X_CMD, cmd);
+	}
+out:
+	return err;
+}
+EXPORT_SYMBOL(pcix_set_mmrbc);
+
+/**
+ * pcie_get_readrq - get PCI Express read request size
+ * @dev: PCI device to query
+ *
+ * Returns maximum memory read request in bytes
+ *    or appropriate error value.
+ */
+int pcie_get_readrq(struct pci_dev *dev)
+{
+	int ret, cap;
+	u16 ctl;
+
+	cap = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	if (!cap)
+		return -EINVAL;
+
+	ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
+	if (!ret)
+	ret = 128 << ((ctl & PCI_EXP_DEVCTL_READRQ) >> 12);
+
+	return ret;
+}
+EXPORT_SYMBOL(pcie_get_readrq);
+
+/**
+ * pcie_set_readrq - set PCI Express maximum memory read request
+ * @dev: PCI device to query
+ * @count: maximum memory read count in bytes
+ *    valid values are 128, 256, 512, 1024, 2048, 4096
+ *
+ * If possible sets maximum read byte count
+ */
+int pcie_set_readrq(struct pci_dev *dev, int rq)
+{
+	int cap, err = -EINVAL;
+	u16 ctl, v;
+
+	if (rq < 128 || rq > 4096 || (rq & (rq-1)))
+		goto out;
+
+	v = (ffs(rq) - 8) << 12;
+
+	cap = pci_find_capability(dev, PCI_CAP_ID_EXP);
+	if (!cap)
+		goto out;
+
+	err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
+	if (err)
+		goto out;
+
+	if ((ctl & PCI_EXP_DEVCTL_READRQ) != v) {
+		ctl &= ~PCI_EXP_DEVCTL_READRQ;
+		ctl |= v;
+		err = pci_write_config_dword(dev, cap + PCI_EXP_DEVCTL, ctl);
+	}
+
+out:
+	return err;
+}
+EXPORT_SYMBOL(pcie_set_readrq);
+
 /**
  * pci_select_bars - Make BAR mask from the type of resource
  * @dev: the PCI device for which BAR mask is made
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 01d8f8a8843c..75bd6a8648f6 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -627,6 +627,22 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8131_BRIDGE, quirk_
 DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8131_BRIDGE, quirk_amd_8131_ioapic);
 #endif /* CONFIG_X86_IO_APIC */
 
+/*
+ * Some settings of MMRBC can lead to data corruption so block changes.
+ * See AMD 8131 HyperTransport PCI-X Tunnel Revision Guide
+ */
+static void __init quirk_amd_8131_mmrbc(struct pci_dev *dev)
+{
+	unsigned char revid;
+
+	pci_read_config_byte(dev, PCI_REVISION_ID, &revid);
+	if (dev->subordinate && revid <= 0x12) {
+		printk(KERN_INFO "AMD8131 rev %x detected, disabling PCI-X MMRBC\n",
+			revid);
+		dev->subordinate->bus_flags |= PCI_BUS_FLAGS_NO_MMRBC;
+	}
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8131_BRIDGE, quirk_amd_8131_mmrbc);
 
 /*
  * FIXME: it is questionable that quirk_via_acpi
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 086a0e5a6318..ac403d74a222 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -111,7 +111,8 @@ enum pcie_reset_state {
 
 typedef unsigned short __bitwise pci_bus_flags_t;
 enum pci_bus_flags {
-	PCI_BUS_FLAGS_NO_MSI = (__force pci_bus_flags_t) 1,
+	PCI_BUS_FLAGS_NO_MSI   = (__force pci_bus_flags_t) 1,
+	PCI_BUS_FLAGS_NO_MMRBC = (__force pci_bus_flags_t) 2,
 };
 
 struct pci_cap_saved_state {
@@ -549,6 +550,10 @@ void pci_intx(struct pci_dev *dev, int enable);
 void pci_msi_off(struct pci_dev *dev);
 int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
 int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
+int pcix_get_max_mmrbc(struct pci_dev *dev);
+int pcix_get_mmrbc(struct pci_dev *dev);
+int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
+int pcie_set_readrq(struct pci_dev *dev, int rq);
 void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int __must_check pci_assign_resource_fixed(struct pci_dev *dev, int i);
-- 
cgit v1.3-8-gc7d7


From 575e3348cb80c3265278756778d5091d5ca4efbf Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 8 May 2007 12:03:07 +1000
Subject: PCI: Use a weak symbol for the empty version of
 pcibios_add_platform_entries()

I'm not sure if this is going to fly, weak symbols work on the compilers I'm
using, but whether they work for all of the affected architectures I can't say.
I've cc'ed as many arch maintainers/lists as I could find.

But assuming they do, we can use a weak empty definition of
pcibios_add_platform_entries() to avoid having an empty definition on every
arch.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/ppc/kernel/pci.c       | 6 ------
 drivers/pci/pci-sysfs.c     | 5 +++++
 include/asm-alpha/pci.h     | 5 -----
 include/asm-arm/pci.h       | 4 ----
 include/asm-cris/pci.h      | 4 ----
 include/asm-frv/pci.h       | 4 ----
 include/asm-h8300/pci.h     | 4 ----
 include/asm-i386/pci.h      | 4 ----
 include/asm-ia64/pci.h      | 4 ----
 include/asm-m68k/pci.h      | 4 ----
 include/asm-m68knommu/pci.h | 4 ----
 include/asm-mips/pci.h      | 4 ----
 include/asm-parisc/pci.h    | 4 ----
 include/asm-powerpc/pci.h   | 2 --
 include/asm-ppc/pci.h       | 2 --
 include/asm-sh/pci.h        | 4 ----
 include/asm-sh64/pci.h      | 4 ----
 include/asm-sparc/pci.h     | 4 ----
 include/asm-sparc64/pci.h   | 4 ----
 include/asm-v850/pci.h      | 4 ----
 include/asm-x86_64/pci.h    | 4 ----
 include/asm-xtensa/pci.h    | 4 ----
 include/linux/pci.h         | 2 ++
 23 files changed, 7 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/kernel/pci.c b/arch/ppc/kernel/pci.c
index 5e723c4c2571..c2ec13bea006 100644
--- a/arch/ppc/kernel/pci.c
+++ b/arch/ppc/kernel/pci.c
@@ -633,12 +633,6 @@ void pcibios_make_OF_bus_map(void)
 {
 }
 
-/* Add sysfs properties */
-void pcibios_add_platform_entries(struct pci_dev *pdev)
-{
-}
-
-
 static int __init
 pcibios_init(void)
 {
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 284e83a527f9..e5737f0b3997 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -600,6 +600,11 @@ static struct bin_attribute pcie_config_attr = {
 	.write = pci_write_config,
 };
 
+void __attribute__ ((weak)) pcibios_add_platform_entries(struct pci_dev *dev)
+{
+	return;
+}
+
 int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
 {
 	struct bin_attribute *rom_attr = NULL;
diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h
index 85aa1127c903..635d6f2b0b03 100644
--- a/include/asm-alpha/pci.h
+++ b/include/asm-alpha/pci.h
@@ -275,11 +275,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 	return hose->need_domain_info;
 }
 
-static inline void
-pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 struct pci_dev *alpha_gendev_to_pci(struct device *dev);
 
 #endif /* __KERNEL__ */
diff --git a/include/asm-arm/pci.h b/include/asm-arm/pci.h
index f21abd4ddac6..9299a3c1ac37 100644
--- a/include/asm-arm/pci.h
+++ b/include/asm-arm/pci.h
@@ -76,10 +76,6 @@ pcibios_select_root(struct pci_dev *pdev, struct resource *res)
 	return root;
 }
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* __KERNEL__ */
  
 #endif
diff --git a/include/asm-cris/pci.h b/include/asm-cris/pci.h
index b2ac8a331da1..5f1986ef7db1 100644
--- a/include/asm-cris/pci.h
+++ b/include/asm-cris/pci.h
@@ -89,10 +89,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       enum pci_mmap_state mmap_state, int write_combine);
 
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* __KERNEL__ */
 
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
diff --git a/include/asm-frv/pci.h b/include/asm-frv/pci.h
index f35a4511e7b9..3aee08c5a44f 100644
--- a/include/asm-frv/pci.h
+++ b/include/asm-frv/pci.h
@@ -22,10 +22,6 @@ struct pci_dev;
 
 #define pcibios_assign_all_busses()	0
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 extern void pcibios_set_master(struct pci_dev *dev);
 
 extern void pcibios_penalize_isa_irq(int irq);
diff --git a/include/asm-h8300/pci.h b/include/asm-h8300/pci.h
index 0c771b05fdd5..97389b35aa35 100644
--- a/include/asm-h8300/pci.h
+++ b/include/asm-h8300/pci.h
@@ -22,8 +22,4 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 
 #define PCI_DMA_BUS_IS_PHYS	(1)
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* _ASM_H8300_PCI_H */
diff --git a/include/asm-i386/pci.h b/include/asm-i386/pci.h
index 64b6d0baedbc..b974bd8ce9c6 100644
--- a/include/asm-i386/pci.h
+++ b/include/asm-i386/pci.h
@@ -94,10 +94,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       enum pci_mmap_state mmap_state, int write_combine);
 
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
diff --git a/include/asm-ia64/pci.h b/include/asm-ia64/pci.h
index 5a5d1c2ce39d..26b69280f6c7 100644
--- a/include/asm-ia64/pci.h
+++ b/include/asm-ia64/pci.h
@@ -143,10 +143,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 	return (pci_domain_nr(bus) != 0);
 }
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 extern void pcibios_resource_to_bus(struct pci_dev *dev,
 		struct pci_bus_region *region, struct resource *res);
 
diff --git a/include/asm-m68k/pci.h b/include/asm-m68k/pci.h
index 9d2c07abe44f..678cb0b52314 100644
--- a/include/asm-m68k/pci.h
+++ b/include/asm-m68k/pci.h
@@ -54,8 +54,4 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
  */
 #define PCI_DMA_BUS_IS_PHYS	(1)
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* _ASM_M68K_PCI_H */
diff --git a/include/asm-m68knommu/pci.h b/include/asm-m68knommu/pci.h
index e04c77e1184d..a99ce768f4a6 100644
--- a/include/asm-m68knommu/pci.h
+++ b/include/asm-m68knommu/pci.h
@@ -30,10 +30,6 @@ static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
  */
 #define pci_dac_dma_supported(pci_dev, mask) (0)
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* CONFIG_COMEMPCI */
 
 #endif /* M68KNOMMU_PCI_H */
diff --git a/include/asm-mips/pci.h b/include/asm-mips/pci.h
index a59d54749eef..6e8c5540bd27 100644
--- a/include/asm-mips/pci.h
+++ b/include/asm-mips/pci.h
@@ -181,10 +181,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 /* Do platform specific device initialization at pci_enable_device() time */
 extern int pcibios_plat_dev_init(struct pci_dev *dev);
 
diff --git a/include/asm-parisc/pci.h b/include/asm-parisc/pci.h
index 7b3be9ac0dda..c331d49e4584 100644
--- a/include/asm-parisc/pci.h
+++ b/include/asm-parisc/pci.h
@@ -284,10 +284,6 @@ pcibios_select_root(struct pci_dev *pdev, struct resource *res)
 	return root;
 }
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't need to penalize isa irq's */
diff --git a/include/asm-powerpc/pci.h b/include/asm-powerpc/pci.h
index ce0f13e8eb14..b36a284bedf3 100644
--- a/include/asm-powerpc/pci.h
+++ b/include/asm-powerpc/pci.h
@@ -243,8 +243,6 @@ extern void of_scan_bus(struct device_node *node, struct pci_bus *bus);
 
 extern int pci_read_irq_line(struct pci_dev *dev);
 
-extern void pcibios_add_platform_entries(struct pci_dev *dev);
-
 struct file;
 extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 					 unsigned long pfn,
diff --git a/include/asm-ppc/pci.h b/include/asm-ppc/pci.h
index 9d162028dab9..0a66a6f1059a 100644
--- a/include/asm-ppc/pci.h
+++ b/include/asm-ppc/pci.h
@@ -145,8 +145,6 @@ pcibios_select_root(struct pci_dev *pdev, struct resource *res)
 	return root;
 }
 
-extern void pcibios_add_platform_entries(struct pci_dev *dev);
-
 struct file;
 extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 					 unsigned long pfn,
diff --git a/include/asm-sh/pci.h b/include/asm-sh/pci.h
index b1f9a9e0231e..6f741f3a5d45 100644
--- a/include/asm-sh/pci.h
+++ b/include/asm-sh/pci.h
@@ -134,10 +134,6 @@ int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin);
 int pciauto_assign_resources(int busno, struct pci_channel *hose);
 #endif
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* __KERNEL__ */
 
 /* generic pci stuff */
diff --git a/include/asm-sh64/pci.h b/include/asm-sh64/pci.h
index aa8043089bb6..0a2b2bd48b87 100644
--- a/include/asm-sh64/pci.h
+++ b/include/asm-sh64/pci.h
@@ -104,10 +104,6 @@ extern void pcibios_fixup_irqs(void);
 extern int pciauto_assign_resources(int busno, struct pci_channel *hose);
 #endif
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* __KERNEL__ */
 
 /* generic pci stuff */
diff --git a/include/asm-sparc/pci.h b/include/asm-sparc/pci.h
index a750c688408b..a1ff7acc7644 100644
--- a/include/asm-sparc/pci.h
+++ b/include/asm-sparc/pci.h
@@ -154,10 +154,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 }
 #endif
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #define PCI_DMA_ERROR_CODE      (~(dma_addr_t)0x0)
 
 static inline int pci_dma_mapping_error(dma_addr_t dma_addr)
diff --git a/include/asm-sparc64/pci.h b/include/asm-sparc64/pci.h
index 47cea16e1bad..202915d82759 100644
--- a/include/asm-sparc64/pci.h
+++ b/include/asm-sparc64/pci.h
@@ -303,10 +303,6 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 
 extern struct resource *pcibios_select_root(struct pci_dev *, struct resource *);
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
 	return PCI_IRQ_NONE;
diff --git a/include/asm-v850/pci.h b/include/asm-v850/pci.h
index 4581826e1cac..de2a7d0a81cc 100644
--- a/include/asm-v850/pci.h
+++ b/include/asm-v850/pci.h
@@ -116,8 +116,4 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
 extern void pci_iounmap (struct pci_dev *dev, void __iomem *addr);
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* __V850_PCI_H__ */
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 49c5e9280598..d95c9e78fe1d 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -135,10 +135,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       enum pci_mmap_state mmap_state, int write_combine);
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* __KERNEL__ */
 
 /* generic pci stuff */
diff --git a/include/asm-xtensa/pci.h b/include/asm-xtensa/pci.h
index 24eb7fc25da8..644411c8e0ca 100644
--- a/include/asm-xtensa/pci.h
+++ b/include/asm-xtensa/pci.h
@@ -74,10 +74,6 @@ int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma,
 /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */
 #define HAVE_PCI_MMAP	1
 
-static inline void pcibios_add_platform_entries(struct pci_dev *dev)
-{
-}
-
 #endif /* __KERNEL__ */
 
 /* Implement the pci_ DMA API in terms of the generic device dma_ one */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ac403d74a222..18319aba1a57 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -881,5 +881,7 @@ extern int pci_pci_problems;
 extern unsigned long pci_cardbus_io_size;
 extern unsigned long pci_cardbus_mem_size;
 
+extern void pcibios_add_platform_entries(struct pci_dev *dev);
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.3-8-gc7d7


From a2cd52ca904f5913651e71764755e712894ccc2f Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 8 May 2007 12:03:08 +1000
Subject: PCI: Make pcibios_add_platform_entries() return errors

Currently pcibios_add_platform_entries() returns void, but could fail,
so instead have it return an int and propagate errors up to
pci_create_sysfs_dev_files().

Fixes:
arch/powerpc/kernel/pci_64.c: In function 'pcibios_add_platform_entries':
arch/powerpc/kernel/pci_64.c:878: warning: ignoring return value of
	'device_create_file', declared with attribute warn_unused_result
arch/powerpc/kernel/pci_32.c: In function 'pcibios_add_platform_entries':
  arch/powerpc/kernel/pci_32.c:1043: warning: ignoring return value of
	'device_create_file', declared with attribute warn_unused_result

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/powerpc/kernel/pci_32.c |  4 ++--
 arch/powerpc/kernel/pci_64.c |  4 ++--
 drivers/pci/pci-sysfs.c      | 10 +++++++---
 include/linux/pci.h          |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index e66064b5093a..86982112b0dd 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -1047,10 +1047,10 @@ void pcibios_make_OF_bus_map(void)
 #endif /* CONFIG_PPC_OF */
 
 /* Add sysfs properties */
-void pcibios_add_platform_entries(struct pci_dev *pdev)
+int pcibios_add_platform_entries(struct pci_dev *pdev)
 {
 #ifdef CONFIG_PPC_OF
-	device_create_file(&pdev->dev, &dev_attr_devspec);
+	return device_create_file(&pdev->dev, &dev_attr_devspec);
 #endif /* CONFIG_PPC_OF */
 }
 
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 249cca27a9b8..96d393c2da02 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -876,9 +876,9 @@ static ssize_t pci_show_devspec(struct device *dev,
 }
 static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL);
 
-void pcibios_add_platform_entries(struct pci_dev *pdev)
+int pcibios_add_platform_entries(struct pci_dev *pdev)
 {
-	device_create_file(&pdev->dev, &dev_attr_devspec);
+	return device_create_file(&pdev->dev, &dev_attr_devspec);
 }
 
 #define ISA_SPACE_MASK 0x1
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index e5737f0b3997..9c4a123eacd1 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -600,9 +600,9 @@ static struct bin_attribute pcie_config_attr = {
 	.write = pci_write_config,
 };
 
-void __attribute__ ((weak)) pcibios_add_platform_entries(struct pci_dev *dev)
+int __attribute__ ((weak)) pcibios_add_platform_entries(struct pci_dev *dev)
 {
-	return;
+	return 0;
 }
 
 int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
@@ -645,10 +645,14 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
 		}
 	}
 	/* add platform-specific attributes */
-	pcibios_add_platform_entries(pdev);
+	if (pcibios_add_platform_entries(pdev))
+		goto err_rom_file;
 
 	return 0;
 
+err_rom_file:
+	if (pci_resource_len(pdev, PCI_ROM_RESOURCE))
+		sysfs_remove_bin_file(&pdev->dev.kobj, rom_attr);
 err_rom:
 	kfree(rom_attr);
 err_resource_files:
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 18319aba1a57..483db814770e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -881,7 +881,7 @@ extern int pci_pci_problems;
 extern unsigned long pci_cardbus_io_size;
 extern unsigned long pci_cardbus_mem_size;
 
-extern void pcibios_add_platform_entries(struct pci_dev *dev);
+extern int pcibios_add_platform_entries(struct pci_dev *dev);
 
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.3-8-gc7d7


From adf809d01043d8808e47db2d35fc07b53062884e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 21 May 2007 14:16:17 -0700
Subject: + pci_find_slot-mark-deprecated.patch added to -mm tree

We've now fixed up most users of pci_find_slot, and the remainder are either
hard and need someone with the hardware and info to work on it, or patches
exist but are not yet merged.

Time therefore for some gentle encouragement

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 483db814770e..4db7b5a18f58 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -476,7 +476,7 @@ extern void pci_sort_breadthfirst(void);
 /* Generic PCI functions exported to card drivers */
 
 struct pci_dev __deprecated *pci_find_device (unsigned int vendor, unsigned int device, const struct pci_dev *from);
-struct pci_dev *pci_find_slot (unsigned int bus, unsigned int devfn);
+struct pci_dev __deprecated *pci_find_slot (unsigned int bus, unsigned int devfn);
 int pci_find_capability (struct pci_dev *dev, int cap);
 int pci_find_next_capability (struct pci_dev *dev, u8 pos, int cap);
 int pci_find_ext_capability (struct pci_dev *dev, int cap);
-- 
cgit v1.3-8-gc7d7


From 65b3bc358a3195ebe459761a248cf33a61539947 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 6 Jun 2007 11:46:49 +0800
Subject: PCI aer: fix stub return values

The stubs used when advanced error reporting is not enabled
must have same return type as real functions.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Acked-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/aer.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aer.h b/include/linux/aer.h
index 402e178b38eb..64aacaed8d6c 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -14,10 +14,10 @@ extern int pci_find_aer_capability(struct pci_dev *dev);
 extern int pci_disable_pcie_error_reporting(struct pci_dev *dev);
 extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 #else
-#define pci_enable_pcie_error_reporting(dev)		do { } while (0)
-#define pci_find_aer_capability(dev)			do { } while (0)
-#define pci_disable_pcie_error_reporting(dev)		do { } while (0)
-#define pci_cleanup_aer_uncorrect_error_status(dev)	do { } while (0)
+#define pci_enable_pcie_error_reporting(dev)		(-EINVAL)
+#define pci_find_aer_capability(dev)			(0)
+#define pci_disable_pcie_error_reporting(dev)		(-EINVAL)
+#define pci_cleanup_aer_uncorrect_error_status(dev)	(-EINVAL)
 #endif
 
 #endif //_AER_H_
-- 
cgit v1.3-8-gc7d7


From f0dce411930d16a678173e534594bca160f5eaff Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 6 Jun 2007 11:50:34 +0800
Subject: PCI aer: add pci_cleanup_aer_correct_aer_status

Function to clear bogus correctable errors. Analog to pci_aer_uncorrect_are_status.
The Marvell chips seem to start out with a bogus value that needs to be
cleared.

Yanmin ported it to 2.6.22-rc4 by fixing a fuzz patch applying info.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Acked-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/pcie/aer/aerdrv_core.c | 16 ++++++++++++++++
 include/linux/aer.h                |  2 ++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index fef159a68081..92a8469b21ba 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -117,6 +117,21 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 	return 0;
 }
 
+int pci_cleanup_aer_correct_error_status(struct pci_dev *dev)
+{
+	int pos;
+	u32 status;
+
+	pos = pci_find_aer_capability(dev);
+	if (!pos)
+		return -EIO;
+
+	pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, &status);
+	pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, status);
+
+	return 0;
+}
+
 static int find_device_iter(struct device *device, void *data)
 {
 	struct pci_dev *dev;
@@ -741,4 +756,5 @@ EXPORT_SYMBOL_GPL(pci_find_aer_capability);
 EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting);
 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status);
+EXPORT_SYMBOL_GPL(pci_cleanup_aer_correct_error_status);
 
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 64aacaed8d6c..509656286e53 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -13,11 +13,13 @@ extern int pci_enable_pcie_error_reporting(struct pci_dev *dev);
 extern int pci_find_aer_capability(struct pci_dev *dev);
 extern int pci_disable_pcie_error_reporting(struct pci_dev *dev);
 extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
+extern int pci_cleanup_aer_correct_error_status(struct pci_dev *dev);
 #else
 #define pci_enable_pcie_error_reporting(dev)		(-EINVAL)
 #define pci_find_aer_capability(dev)			(0)
 #define pci_disable_pcie_error_reporting(dev)		(-EINVAL)
 #define pci_cleanup_aer_uncorrect_error_status(dev)	(-EINVAL)
+#define pci_cleanup_aer_correct_error_status(dev)	(-EINVAL)
 #endif
 
 #endif //_AER_H_
-- 
cgit v1.3-8-gc7d7


From 56906c612e10b5e32a48ccbe8a3c08ab6acf5a28 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Mon, 7 May 2007 10:26:17 -0700
Subject: PCI: remove useless pci driver method

Remove pointless and never-called enable_wake() hook from pci_driver and
from documentation.  Evidently this was introduced in the 2.4.6 kernel,
but there's no evidence it was ever called; and it was rarely implemented.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/pci.txt                         |  3 ---
 Documentation/power/pci.txt                   | 37 ---------------------------
 drivers/net/typhoon.c                         |  7 -----
 drivers/net/wireless/hostap/hostap_pci.c      |  2 --
 drivers/net/wireless/hostap/hostap_plx.c      |  3 ---
 drivers/net/wireless/prism54/islpci_hotplug.c |  1 -
 drivers/scsi/nsp32.c                          | 10 --------
 include/linux/pci.h                           |  1 -
 8 files changed, 64 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/pci.txt b/Documentation/pci.txt
index d38261b67905..7d3da30ff0b4 100644
--- a/Documentation/pci.txt
+++ b/Documentation/pci.txt
@@ -113,9 +113,6 @@ initialization with a pointer to a structure describing the driver
 		(Please see Documentation/power/pci.txt for descriptions
 		of PCI Power Management and the related functions.)
 
-	enable_wake	Enable device to generate wake events from a low power
-			state.
-
 	shutdown	Hook into reboot_notifier_list (kernel/sys.c).
 			Intended to stop any idling DMA operations.
 			Useful for enabling wake-on-lan (NIC) or changing
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
index e00b099a4b86..dd8fe43888d3 100644
--- a/Documentation/power/pci.txt
+++ b/Documentation/power/pci.txt
@@ -164,7 +164,6 @@ struct pci_driver:
 
         int  (*suspend) (struct pci_dev *dev, pm_message_t state);
         int  (*resume) (struct pci_dev *dev);
-        int  (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable);
 
 
 suspend
@@ -251,42 +250,6 @@ The driver should update the current_state field in its pci_dev structure in
 this function, except for PM-capable devices when pci_set_power_state is used.
 
 
-enable_wake
------------
-
-Usage:
-
-if (dev->driver && dev->driver->enable_wake)
-	dev->driver->enable_wake(dev,state,enable);
-
-This callback is generally only relevant for devices that support the PCI PM
-spec and have the ability to generate a PME# (Power Management Event Signal)
-to wake the system up. (However, it is possible that a device may support 
-some non-standard way of generating a wake event on sleep.)
-
-Bits 15:11 of the PMC (Power Mgmt Capabilities) Register in a device's
-PM Capabilities describe what power states the device supports generating a 
-wake event from:
-
-+------------------+
-|  Bit  |  State   |
-+------------------+
-|  11   |   D0     |
-|  12   |   D1     |
-|  13   |   D2     |
-|  14   |   D3hot  |
-|  15   |   D3cold |
-+------------------+
-
-A device can use this to enable wake events:
-
-	 pci_enable_wake(dev,state,enable);
-
-Note that to enable PME# from D3cold, a value of 4 should be passed to 
-pci_enable_wake (since it uses an index into a bitmask). If a driver gets
-a request to enable wake events from D3, two calls should be made to 
-pci_enable_wake (one for both D3hot and D3cold).
-
 
 A reference implementation
 -------------------------
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index 15b2fb8aa492..0f2c0610ed71 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -2267,12 +2267,6 @@ need_resume:
 	typhoon_resume(pdev);
 	return -EBUSY;
 }
-
-static int
-typhoon_enable_wake(struct pci_dev *pdev, pci_power_t state, int enable)
-{
-	return pci_enable_wake(pdev, state, enable);
-}
 #endif
 
 static int __devinit
@@ -2636,7 +2630,6 @@ static struct pci_driver typhoon_driver = {
 #ifdef CONFIG_PM
 	.suspend	= typhoon_suspend,
 	.resume		= typhoon_resume,
-	.enable_wake	= typhoon_enable_wake,
 #endif
 };
 
diff --git a/drivers/net/wireless/hostap/hostap_pci.c b/drivers/net/wireless/hostap/hostap_pci.c
index 0cd48d151f5e..7da3664b8515 100644
--- a/drivers/net/wireless/hostap/hostap_pci.c
+++ b/drivers/net/wireless/hostap/hostap_pci.c
@@ -453,8 +453,6 @@ static struct pci_driver prism2_pci_drv_id = {
 	.suspend	= prism2_pci_suspend,
 	.resume		= prism2_pci_resume,
 #endif /* CONFIG_PM */
-	/* Linux 2.4.6 added save_state and enable_wake that are not used here
-	 */
 };
 
 
diff --git a/drivers/net/wireless/hostap/hostap_plx.c b/drivers/net/wireless/hostap/hostap_plx.c
index 0183df757b3e..040dc3e36410 100644
--- a/drivers/net/wireless/hostap/hostap_plx.c
+++ b/drivers/net/wireless/hostap/hostap_plx.c
@@ -613,9 +613,6 @@ static struct pci_driver prism2_plx_drv_id = {
 	.id_table	= prism2_plx_id_table,
 	.probe		= prism2_plx_probe,
 	.remove		= prism2_plx_remove,
-	.suspend	= NULL,
-	.resume		= NULL,
-	.enable_wake	= NULL
 };
 
 
diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c
index 3dcb13bb7d57..25d6c80c9bab 100644
--- a/drivers/net/wireless/prism54/islpci_hotplug.c
+++ b/drivers/net/wireless/prism54/islpci_hotplug.c
@@ -87,7 +87,6 @@ static struct pci_driver prism54_driver = {
 	.remove = prism54_remove,
 	.suspend = prism54_suspend,
 	.resume = prism54_resume,
-	/* .enable_wake ; we don't support this yet */
 };
 
 /******************************************************************************
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index f6f561d26bf0..3e9765f0281d 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -3487,15 +3487,6 @@ static int nsp32_resume(struct pci_dev *pdev)
 	return 0;
 }
 
-/* Enable wake event */
-static int nsp32_enable_wake(struct pci_dev *pdev, pci_power_t state, int enable)
-{
-	struct Scsi_Host *host = pci_get_drvdata(pdev);
-
-	nsp32_msg(KERN_INFO, "pci-enable_wake: stub, pdev=0x%p, enable=%d, slot=%s, host=0x%p", pdev, enable, pci_name(pdev), host);
-
-	return 0;
-}
 #endif
 
 /************************************************************************
@@ -3571,7 +3562,6 @@ static struct pci_driver nsp32_driver = {
 #ifdef CONFIG_PM
 	.suspend	= nsp32_suspend, 
 	.resume		= nsp32_resume, 
-	.enable_wake    = nsp32_enable_wake,
 #endif
 };
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4db7b5a18f58..5be420ac6303 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -371,7 +371,6 @@ struct pci_driver {
 	int  (*suspend_late) (struct pci_dev *dev, pm_message_t state);
 	int  (*resume_early) (struct pci_dev *dev);
 	int  (*resume) (struct pci_dev *dev);	                /* Device woken up */
-	int  (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable);   /* Enable wake event */
 	void (*shutdown) (struct pci_dev *dev);
 
 	struct pci_error_handlers *err_handler;
-- 
cgit v1.3-8-gc7d7


From b8a3a5214d7cc115f1ca3a3967b7229d97c46f4a Mon Sep 17 00:00:00 2001
From: Auke Kok <auke-jan.h.kok@intel.com>
Date: Fri, 8 Jun 2007 15:46:30 -0700
Subject: PCI: read revision ID by default

Currently there are 97 occurrences where drivers need the pci
revision ID. We can do this once for all devices. Even the pci
subsystem needs the revision several times for quirks. The extra
u8 member pads out nicely in the pci_dev struct.

Signed-off-by: Auke Kok <auke-jan.h.kok@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/powerpc/kernel/pci_64.c | 2 ++
 arch/sparc64/kernel/pci.c    | 1 +
 drivers/pci/probe.c          | 1 +
 include/linux/pci.h          | 1 +
 4 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 96d393c2da02..e3009a43ac56 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -367,8 +367,10 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
 	sprintf(pci_name(dev), "%04x:%02x:%02x.%d", pci_domain_nr(bus),
 		dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
 	dev->class = get_int_prop(node, "class-code", 0);
+	dev->revision = get_int_prop(node, "revision-id", 0);
 
 	DBG("    class: 0x%x\n", dev->class);
+	DBG("    revision: 0x%x\n", dev->revision);
 
 	dev->current_state = 4;		/* unknown power state */
 	dev->error_state = pci_channel_io_normal;
diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c
index 81f4a5ea05f7..55ad1b899bb8 100644
--- a/arch/sparc64/kernel/pci.c
+++ b/arch/sparc64/kernel/pci.c
@@ -448,6 +448,7 @@ struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm,
 		 */
 		pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
 		dev->class = class >> 8;
+		dev->revision = class & 0xff;
 
 		sprintf(pci_name(dev), "%04x:%02x:%02x.%d", pci_domain_nr(bus),
 			dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn));
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 9cd983acba8c..8802fcb4aaf0 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -702,6 +702,7 @@ static int pci_setup_device(struct pci_dev * dev)
 		dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
 
 	pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
+	dev->revision = class & 0xff;
 	class >>= 8;				    /* upper 3 bytes */
 	dev->class = class;
 	class >>= 8;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 5be420ac6303..45332440a2e6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -139,6 +139,7 @@ struct pci_dev {
 	unsigned short	subsystem_vendor;
 	unsigned short	subsystem_device;
 	unsigned int	class;		/* 3 bytes: (base,sub,prog-if) */
+	u8		revision;	/* PCI revision, low byte of class word */
 	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
 	u8		rom_base_reg;	/* which config register controls the ROM */
 	u8		pin;  		/* which interrupt pin this device uses */
-- 
cgit v1.3-8-gc7d7


From 1d0ed384c1f2582b6f7408642c77a78a0c410122 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 18 Jun 2007 10:58:13 +0200
Subject: PCI: ATM: lanai, change VENDOR to DEVICE

lanai, change VENDOR to DEVICE

There were 2 bad named macros in pci_ids (LANAI 2 and IHB). Rename it to
DEVICE, because it's device id. Also make some cleanpu in pci_device_id
table (use PCI_VDEVICE).

Cc: Mitchell Blank Jr <mitch@sfgoth.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/atm/lanai.c     | 14 ++++----------
 include/linux/pci_ids.h |  4 ++--
 2 files changed, 6 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 203c70378b15..0e2c1ae650e7 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -246,8 +246,8 @@ struct lanai_vcc {
 };
 
 enum lanai_type {
-	lanai2	= PCI_VENDOR_ID_EF_ATM_LANAI2,
-	lanaihb	= PCI_VENDOR_ID_EF_ATM_LANAIHB
+	lanai2	= PCI_DEVICE_ID_EF_ATM_LANAI2,
+	lanaihb	= PCI_DEVICE_ID_EF_ATM_LANAIHB
 };
 
 struct lanai_dev_stats {
@@ -2622,14 +2622,8 @@ static int __devinit lanai_init_one(struct pci_dev *pci,
 }
 
 static struct pci_device_id lanai_pci_tbl[] = {
-	{
-		PCI_VENDOR_ID_EF, PCI_VENDOR_ID_EF_ATM_LANAI2,
-		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0
-	},
-	{
-		PCI_VENDOR_ID_EF, PCI_VENDOR_ID_EF_ATM_LANAIHB,
-		PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0
-	},
+	{ PCI_VDEVICE(EF, PCI_DEVICE_ID_EF_ATM_LANAI2) },
+	{ PCI_VDEVICE(EF, PCI_DEVICE_ID_EF_ATM_LANAIHB) },
 	{ 0, }	/* terminal entry */
 };
 MODULE_DEVICE_TABLE(pci, lanai_pci_tbl);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 75c4d4d06892..a260a947c917 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1383,8 +1383,8 @@
 #define PCI_VENDOR_ID_EF		0x111a
 #define PCI_DEVICE_ID_EF_ATM_FPGA	0x0000
 #define PCI_DEVICE_ID_EF_ATM_ASIC	0x0002
-#define PCI_VENDOR_ID_EF_ATM_LANAI2	0x0003
-#define PCI_VENDOR_ID_EF_ATM_LANAIHB	0x0005
+#define PCI_DEVICE_ID_EF_ATM_LANAI2	0x0003
+#define PCI_DEVICE_ID_EF_ATM_LANAIHB	0x0005
 
 #define PCI_VENDOR_ID_IDT		0x111d
 #define PCI_DEVICE_ID_IDT_IDT77201	0x0001
-- 
cgit v1.3-8-gc7d7


From c43eaa02abf3b034a9694dcca5c177ecb6072f89 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 18 Jun 2007 10:58:14 +0200
Subject: PCI: i386: traps, change VENDOR to DEVICE

traps, change VENDOR to DEVICE

Change macro for SGI lithium (arch/i386/mach-visws/traps.c) device from
VENDOR to DEVICE, because it's a device id.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/i386/mach-visws/traps.c | 4 ++--
 include/linux/pci_ids.h      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mach-visws/traps.c b/arch/i386/mach-visws/traps.c
index 5199bd03254a..843b67acf43b 100644
--- a/arch/i386/mach-visws/traps.c
+++ b/arch/i386/mach-visws/traps.c
@@ -23,13 +23,13 @@ static __init void lithium_init(void)
 	set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
 
 	if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
-	    (li_pcia_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) {
+	    (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
 		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
 		panic("This machine is not SGI Visual Workstation 320/540");
 	}
 
 	if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
-	    (li_pcib_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) {
+	    (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
 		printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
 		panic("This machine is not SGI Visual Workstation 320/540");
 	}
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a260a947c917..228e0befeda1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -944,8 +944,8 @@
 
 #define PCI_VENDOR_ID_SGI		0x10a9
 #define PCI_DEVICE_ID_SGI_IOC3		0x0003
+#define PCI_DEVICE_ID_SGI_LITHIUM	0x1002
 #define PCI_DEVICE_ID_SGI_IOC4		0x100a
-#define PCI_VENDOR_ID_SGI_LITHIUM	0x1002
 
 
 #define PCI_VENDOR_ID_WINBOND		0x10ad
-- 
cgit v1.3-8-gc7d7


From 03966e097db1a3b7aff5d364277f2e66069923df Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 18 Jun 2007 10:55:30 +0200
Subject: PCI: pci_ids, reorder some entries

pci_ids, reorder some entries

Some lines are not vendor sorted, reorder it to comply with the rest of
document.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci_ids.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 228e0befeda1..0058fb920c74 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -133,6 +133,9 @@
 
 /* Vendors and devices.  Sort key: vendor first, device next. */
 
+#define PCI_VENDOR_ID_TTTECH		0x0357
+#define PCI_DEVICE_ID_TTTECH_MC322	0x000a
+
 #define PCI_VENDOR_ID_DYNALINK		0x0675
 #define PCI_DEVICE_ID_DYNALINK_IS64PH	0x1702
 
@@ -1902,6 +1905,8 @@
 #define PCI_DEVICE_ID_OXSEMI_16PCI952	0x9521
 #define PCI_DEVICE_ID_OXSEMI_16PCI952PP	0x9523
 
+#define PCI_VENDOR_ID_CHELSIO		0x1425
+
 #define PCI_VENDOR_ID_SAMSUNG		0x144d
 
 #define PCI_VENDOR_ID_MYRICOM		0x14c1
@@ -2010,8 +2015,6 @@
 #define PCI_DEVICE_ID_ENE_720		0x1421
 #define PCI_DEVICE_ID_ENE_722		0x1422
 
-#define PCI_VENDOR_ID_CHELSIO		0x1425
-
 #define PCI_SUBVENDOR_ID_PERLE          0x155f
 #define PCI_SUBDEVICE_ID_PCI_RAS4       0xf001
 #define PCI_SUBDEVICE_ID_PCI_RAS8       0xf010
@@ -2035,6 +2038,9 @@
 #define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c
 #define PCI_DEVICE_ID_MELLANOX_SINAI	0x6274
 
+#define PCI_VENDOR_ID_QUICKNET		0x15e2
+#define PCI_DEVICE_ID_QUICKNET_XJ	0x0500
+
 #define PCI_VENDOR_ID_PDC		0x15e9
 
 
@@ -2412,13 +2418,7 @@
 #define PCI_DEVICE_ID_TIGERJET_300	0x0001
 #define PCI_DEVICE_ID_TIGERJET_100	0x0002
 
-#define PCI_VENDOR_ID_TTTECH		0x0357
-#define PCI_DEVICE_ID_TTTECH_MC322	0x000A
-
 #define PCI_VENDOR_ID_XILINX_RME	0xea60
 #define PCI_DEVICE_ID_RME_DIGI32	0x9896
 #define PCI_DEVICE_ID_RME_DIGI32_PRO	0x9897
 #define PCI_DEVICE_ID_RME_DIGI32_8	0x9898
-
-#define PCI_VENDOR_ID_QUICKNET		0x15E2
-#define PCI_DEVICE_ID_QUICKNET_XJ	0x0500
-- 
cgit v1.3-8-gc7d7


From f732ee0b71365ddc20e6a0b408f9fd1732d7eb75 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 18 Jun 2007 10:58:14 +0200
Subject: PCI: pci_ids, add atheros and 3com_2 vendors

pci_ids, add atheros and 3com_2 vendors

Atheros is wifi vendor. 3com_2 (0xa727) is an vendor id for one card with
ath chip.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci_ids.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0058fb920c74..0995e97b1ccf 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2059,6 +2059,8 @@
 #define PCI_DEVICE_ID_BCM1250_PCI	0x0001
 #define PCI_DEVICE_ID_BCM1250_HT	0x0002
 
+#define PCI_VENDOR_ID_ATHEROS		0x168c
+
 #define PCI_VENDOR_ID_NETCELL		0x169c
 #define PCI_DEVICE_ID_REVOLUTION	0x0044
 
@@ -2410,6 +2412,8 @@
 #define PCI_DEVICE_ID_NETMOS_9845	0x9845
 #define PCI_DEVICE_ID_NETMOS_9855	0x9855
 
+#define PCI_VENDOR_ID_3COM_2		0xa727
+
 #define PCI_SUBVENDOR_ID_EXSYS		0xd84d
 #define PCI_SUBDEVICE_ID_EXSYS_4014	0x4014
 #define PCI_SUBDEVICE_ID_EXSYS_4055	0x4055
-- 
cgit v1.3-8-gc7d7


From 12bedda9f404c4d34eda6477b0ec32140d83501b Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 18 Jun 2007 10:56:52 +0200
Subject: PCI: pci_ids, remove double or more empty lines

pci_ids, remove two or more empty lines

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci_ids.h | 48 ------------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0995e97b1ccf..93961e9d0308 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -735,7 +735,6 @@
 #define PCI_DEVICE_ID_ELSA_MICROLINK	0x1000
 #define PCI_DEVICE_ID_ELSA_QS3000	0x3000
 
-
 #define PCI_VENDOR_ID_BUSLOGIC		      0x104B
 #define PCI_DEVICE_ID_BUSLOGIC_MULTIMASTER_NC 0x0140
 #define PCI_DEVICE_ID_BUSLOGIC_MULTIMASTER    0x1040
@@ -781,7 +780,6 @@
 
 #define PCI_VENDOR_ID_SONY		0x104d
 
-
 /* Winbond have two vendor IDs! See 0x10ad as well */
 #define PCI_VENDOR_ID_WINBOND2		0x1050
 #define PCI_DEVICE_ID_WINBOND2_89C940F	0x5a5a
@@ -819,7 +817,6 @@
 #define PCI_DEVICE_ID_PROMISE_20276	0x5275
 #define PCI_DEVICE_ID_PROMISE_20277	0x7275
 
-
 #define PCI_VENDOR_ID_UMC		0x1060
 #define PCI_DEVICE_ID_UMC_UM8673F	0x0101
 #define PCI_DEVICE_ID_UMC_UM8886BF	0x673a
@@ -835,7 +832,6 @@
 #define PCI_DEVICE_ID_MYLEX_DAC960_BA	0xBA56
 #define PCI_DEVICE_ID_MYLEX_DAC960_GEM	0xB166
 
-
 #define PCI_VENDOR_ID_APPLE		0x106b
 #define PCI_DEVICE_ID_APPLE_BANDIT	0x0001
 #define PCI_DEVICE_ID_APPLE_HYDRA	0x000e
@@ -871,7 +867,6 @@
 #define PCI_DEVICE_ID_YAMAHA_744	0x0010
 #define PCI_DEVICE_ID_YAMAHA_754	0x0012
 
-
 #define PCI_VENDOR_ID_QLOGIC		0x1077
 #define PCI_DEVICE_ID_QLOGIC_ISP10160	0x1016
 #define PCI_DEVICE_ID_QLOGIC_ISP1020	0x1020
@@ -902,12 +897,9 @@
 #define PCI_DEVICE_ID_CYRIX_5530_AUDIO	0x0103
 #define PCI_DEVICE_ID_CYRIX_5530_VIDEO	0x0104
 
-
-
 #define PCI_VENDOR_ID_CONTAQ		0x1080
 #define PCI_DEVICE_ID_CONTAQ_82C693	0xc693
 
-
 #define PCI_VENDOR_ID_OLICOM		0x108d
 #define PCI_DEVICE_ID_OLICOM_OC2325	0x0012
 #define PCI_DEVICE_ID_OLICOM_OC2183	0x0013
@@ -939,23 +931,19 @@
 #define PCI_DEVICE_ID_SII_3112		0x3112
 #define PCI_DEVICE_ID_SII_1210SA	0x0240
 
-
 #define PCI_VENDOR_ID_BROOKTREE		0x109e
 #define PCI_DEVICE_ID_BROOKTREE_878	0x0878
 #define PCI_DEVICE_ID_BROOKTREE_879	0x0879
 
-
 #define PCI_VENDOR_ID_SGI		0x10a9
 #define PCI_DEVICE_ID_SGI_IOC3		0x0003
 #define PCI_DEVICE_ID_SGI_LITHIUM	0x1002
 #define PCI_DEVICE_ID_SGI_IOC4		0x100a
 
-
 #define PCI_VENDOR_ID_WINBOND		0x10ad
 #define PCI_DEVICE_ID_WINBOND_82C105	0x0105
 #define PCI_DEVICE_ID_WINBOND_83C553	0x0565
 
-
 #define PCI_VENDOR_ID_PLX		0x10b5
 #define PCI_DEVICE_ID_PLX_R685		0x1030
 #define PCI_DEVICE_ID_PLX_ROMULUS	0x106a
@@ -989,7 +977,6 @@
 #define PCI_DEVICE_ID_3COM_3CR990SVR97	0x9909
 #define PCI_DEVICE_ID_3COM_3CR990SVR	0x990a
 
-
 #define PCI_VENDOR_ID_AL		0x10b9
 #define PCI_DEVICE_ID_AL_M1533		0x1533
 #define PCI_DEVICE_ID_AL_M1535 		0x1535
@@ -1012,18 +999,14 @@
 #define PCI_DEVICE_ID_AL_M5451		0x5451
 #define PCI_DEVICE_ID_AL_M7101		0x7101
 
-
-
 #define PCI_VENDOR_ID_NEOMAGIC		0x10c8
 #define PCI_DEVICE_ID_NEOMAGIC_NM256AV_AUDIO 0x8005
 #define PCI_DEVICE_ID_NEOMAGIC_NM256ZX_AUDIO 0x8006
 #define PCI_DEVICE_ID_NEOMAGIC_NM256XL_PLUS_AUDIO 0x8016
 
-
 #define PCI_VENDOR_ID_TCONRAD		0x10da
 #define PCI_DEVICE_ID_TCONRAD_TOKENRING	0x0508
 
-
 #define PCI_VENDOR_ID_NVIDIA			0x10de
 #define PCI_DEVICE_ID_NVIDIA_TNT		0x0020
 #define PCI_DEVICE_ID_NVIDIA_TNT2		0x0028
@@ -1244,9 +1227,6 @@
 #define PCI_DEVICE_ID_IMS_TT128		0x9128
 #define PCI_DEVICE_ID_IMS_TT3D		0x9135
 
-
-
-
 #define PCI_VENDOR_ID_INTERG		0x10ea
 #define PCI_DEVICE_ID_INTERG_1682	0x1682
 #define PCI_DEVICE_ID_INTERG_2000	0x2000
@@ -1265,7 +1245,6 @@
 #define PCI_DEVICE_ID_XILINX_HAMMERFALL_DSP 0x3fc5
 #define PCI_DEVICE_ID_XILINX_HAMMERFALL_DSP_MADI 0x3fc6
 
-
 #define PCI_VENDOR_ID_INIT		0x1101
 
 #define PCI_VENDOR_ID_CREATIVE		0x1102 /* duplicate: ECTIVA */
@@ -1360,7 +1339,6 @@
 #define PCI_VENDOR_ID_SIEMENS           0x110A
 #define PCI_DEVICE_ID_SIEMENS_DSCC4     0x2102
 
-
 #define PCI_VENDOR_ID_VORTEX		0x1119
 #define PCI_DEVICE_ID_VORTEX_GDT60x0	0x0000
 #define PCI_DEVICE_ID_VORTEX_GDT6000B	0x0001
@@ -1395,7 +1373,6 @@
 #define PCI_VENDOR_ID_FORE		0x1127
 #define PCI_DEVICE_ID_FORE_PCA200E	0x0300
 
-
 #define PCI_VENDOR_ID_PHILIPS		0x1131
 #define PCI_DEVICE_ID_PHILIPS_SAA7146	0x7146
 #define PCI_DEVICE_ID_PHILIPS_SAA9730	0x9730
@@ -1414,7 +1391,6 @@
 #define PCI_DEVICE_ID_ZIATECH_5550_HC	0x5550
  
 
-
 #define PCI_VENDOR_ID_SYSKONNECT	0x1148
 #define PCI_DEVICE_ID_SYSKONNECT_TR	0x4200
 #define PCI_DEVICE_ID_SYSKONNECT_GE	0x4300
@@ -1422,7 +1398,6 @@
 #define PCI_DEVICE_ID_SYSKONNECT_9DXX	0x4400
 #define PCI_DEVICE_ID_SYSKONNECT_9MXX	0x4500
 
-
 #define PCI_VENDOR_ID_DIGI		0x114f
 #define PCI_DEVICE_ID_DIGI_DF_M_IOM2_E	0x0070
 #define PCI_DEVICE_ID_DIGI_DF_M_E	0x0071
@@ -1433,12 +1408,10 @@
 #define PCI_DEVICE_ID_NEO_2RJ45         0x00CA
 #define PCI_DEVICE_ID_NEO_2RJ45PRI      0x00CB
 
-
 #define PCI_VENDOR_ID_XIRCOM		0x115d
 #define PCI_DEVICE_ID_XIRCOM_RBM56G	0x0101
 #define PCI_DEVICE_ID_XIRCOM_X3201_MDM	0x0103
 
-
 #define PCI_VENDOR_ID_SERVERWORKS	  0x1166
 #define PCI_DEVICE_ID_SERVERWORKS_HE	  0x0008
 #define PCI_DEVICE_ID_SERVERWORKS_LE	  0x0009
@@ -1507,7 +1480,6 @@
 #define PCI_DEVICE_ID_ZEITNET_1221	0x0001
 #define PCI_DEVICE_ID_ZEITNET_1225	0x0002
 
-
 #define PCI_VENDOR_ID_FUJITSU_ME	0x119e
 #define PCI_DEVICE_ID_FUJITSU_FS155	0x0001
 #define PCI_DEVICE_ID_FUJITSU_FS50	0x0003
@@ -1525,28 +1497,23 @@
 #define PCI_DEVICE_ID_V3_V960		0x0001
 #define PCI_DEVICE_ID_V3_V351		0x0002
 
-
 #define PCI_VENDOR_ID_ATT		0x11c1
 #define PCI_DEVICE_ID_ATT_VENUS_MODEM	0x480
 
-
 #define PCI_VENDOR_ID_SPECIALIX		0x11cb
 #define PCI_DEVICE_ID_SPECIALIX_IO8	0x2000
 #define PCI_DEVICE_ID_SPECIALIX_RIO	0x8000
 #define PCI_SUBDEVICE_ID_SPECIALIX_SPEED4 0xa004
 
-
 #define PCI_VENDOR_ID_ANALOG_DEVICES	0x11d4
 #define PCI_DEVICE_ID_AD1889JS		0x1889
 
-
 #define PCI_DEVICE_ID_SEGA_BBA		0x1234
 
 #define PCI_VENDOR_ID_ZORAN		0x11de
 #define PCI_DEVICE_ID_ZORAN_36057	0x6057
 #define PCI_DEVICE_ID_ZORAN_36120	0x6120
 
-
 #define PCI_VENDOR_ID_COMPEX		0x11f6
 #define PCI_DEVICE_ID_COMPEX_ENET100VG4	0x0112
 
@@ -1605,8 +1572,6 @@
 #define PCI_DEVICE_ID_3DFX_VOODOO3	0x0005
 #define PCI_DEVICE_ID_3DFX_VOODOO5	0x0009
 
-
-
 #define PCI_VENDOR_ID_AVM		0x1244
 #define PCI_DEVICE_ID_AVM_B1		0x0700
 #define PCI_DEVICE_ID_AVM_C4		0x0800
@@ -1615,7 +1580,6 @@
 #define PCI_DEVICE_ID_AVM_C2		0x1100
 #define PCI_DEVICE_ID_AVM_T1		0x1200
 
-
 #define PCI_VENDOR_ID_STALLION		0x124d
 
 /* Allied Telesyn */
@@ -1638,7 +1602,6 @@
 #define PCI_VENDOR_ID_SATSAGEM		0x1267
 #define PCI_DEVICE_ID_SATSAGEM_NICCY	0x1016
 
-
 #define PCI_VENDOR_ID_ENSONIQ		0x1274
 #define PCI_DEVICE_ID_ENSONIQ_CT5880	0x5880
 #define PCI_DEVICE_ID_ENSONIQ_ES1370	0x5000
@@ -1661,7 +1624,6 @@
 
 #define PCI_VENDOR_ID_ALTEON		0x12ae
 
-
 #define PCI_SUBVENDOR_ID_CONNECT_TECH			0x12c4
 #define PCI_SUBDEVICE_ID_CONNECT_TECH_BH8_232		0x0001
 #define PCI_SUBDEVICE_ID_CONNECT_TECH_BH4_232		0x0002
@@ -1692,7 +1654,6 @@
 #define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_4_485	0x0331
 #define PCI_SUBDEVICE_ID_CONNECT_TECH_PCI_UART_8_485	0x0332
 
-
 #define PCI_VENDOR_ID_NVIDIA_SGS	0x12d2
 #define PCI_DEVICE_ID_NVIDIA_SGS_RIVA128 0x0018
 
@@ -1802,7 +1763,6 @@
 #define PCI_DEVICE_ID_LMC_SSI		0x0005
 #define PCI_DEVICE_ID_LMC_T1		0x0006
 
-
 #define PCI_VENDOR_ID_NETGEAR		0x1385
 #define PCI_DEVICE_ID_NETGEAR_GA620	0x620a
 
@@ -2019,7 +1979,6 @@
 #define PCI_SUBDEVICE_ID_PCI_RAS4       0xf001
 #define PCI_SUBDEVICE_ID_PCI_RAS8       0xf010
 
-
 #define PCI_VENDOR_ID_SYBA		0x1592
 #define PCI_DEVICE_ID_SYBA_2P_EPP	0x0782
 #define PCI_DEVICE_ID_SYBA_1P_ECP	0x0783
@@ -2043,7 +2002,6 @@
 
 #define PCI_VENDOR_ID_PDC		0x15e9
 
-
 #define PCI_VENDOR_ID_FARSITE           0x1619
 #define PCI_DEVICE_ID_FARSITE_T2P       0x0400
 #define PCI_DEVICE_ID_FARSITE_T4P       0x0440
@@ -2099,7 +2057,6 @@
 #define PCI_DEVICE_ID_HERC_WIN		0x5732
 #define PCI_DEVICE_ID_HERC_UNI		0x5832
 
-
 #define PCI_VENDOR_ID_SITECOM		0x182d
 #define PCI_DEVICE_ID_SITECOM_DC105V2	0x3069
 
@@ -2135,12 +2092,9 @@
 #define PCI_DEVICE_ID_3DLABS_PERMEDIA2	0x0007
 #define PCI_DEVICE_ID_3DLABS_PERMEDIA2V	0x0009
 
-
 #define PCI_VENDOR_ID_AKS		0x416c
 #define PCI_DEVICE_ID_AKS_ALADDINCARD	0x0100
 
-
-
 #define PCI_VENDOR_ID_S3		0x5333
 #define PCI_DEVICE_ID_S3_TRIO		0x8811
 #define PCI_DEVICE_ID_S3_868		0x8880
@@ -2152,7 +2106,6 @@
 #define PCI_VENDOR_ID_DUNORD		0x5544
 #define PCI_DEVICE_ID_DUNORD_I3000	0x0001
 
-
 #define PCI_VENDOR_ID_DCI		0x6666
 #define PCI_DEVICE_ID_DCI_PCCOM4	0x0001
 #define PCI_DEVICE_ID_DCI_PCCOM8	0x0002
@@ -2396,7 +2349,6 @@
 #define PCI_DEVICE_ID_ADAPTEC2_OBSIDIAN   0x0500
 #define PCI_DEVICE_ID_ADAPTEC2_SCAMP	0x0503
 
-
 #define PCI_VENDOR_ID_HOLTEK		0x9412
 #define PCI_DEVICE_ID_HOLTEK_6565	0x6565
 
-- 
cgit v1.3-8-gc7d7


From 579082df38839efc5b14aa3f48b8806e3e8dc5c2 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Tue, 10 Jul 2007 13:35:05 +0200
Subject: PCI: Fix typo in include/linux/pci.h

Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 45332440a2e6..a6657b7f245d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -315,7 +315,7 @@ struct pci_dynids {
 
 /* ---------------------------------------------------------------- */
 /** PCI Error Recovery System (PCI-ERS).  If a PCI device driver provides
- *  a set fof callbacks in struct pci_error_handlers, then that device driver
+ *  a set of callbacks in struct pci_error_handlers, then that device driver
  *  will be notified of PCI bus errors, and will be driven to recovery
  *  when an error occurs.
  */
-- 
cgit v1.3-8-gc7d7


From 694625c0b322905d6892fad873029f764cd4823f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 9 Jul 2007 11:55:54 -0700
Subject: PCI: add pci_try_set_mwi

As suggested by Andrew, add pci_try_set_mwi(), which does not require
return-value checking.

- add pci_try_set_mwi() without __must_check
- make it return 0 on success, errno if the "try" failed or error
- review callers

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/pci.txt                         |  5 ++++-
 drivers/ata/pata_cs5530.c                     |  2 +-
 drivers/ide/pci/cs5530.c                      |  2 +-
 drivers/net/cassini.c                         |  4 ++--
 drivers/net/starfire.c                        |  2 +-
 drivers/net/tulip/tulip_core.c                |  2 +-
 drivers/net/wireless/prism54/islpci_hotplug.c |  3 +--
 drivers/pci/pci.c                             | 28 +++++++++++++++++++++++----
 drivers/scsi/lpfc/lpfc_init.c                 |  5 +----
 drivers/usb/gadget/net2280.c                  |  2 +-
 include/linux/pci.h                           |  1 +
 11 files changed, 38 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/pci.txt b/Documentation/pci.txt
index 7d3da30ff0b4..7754f5aea4e9 100644
--- a/Documentation/pci.txt
+++ b/Documentation/pci.txt
@@ -296,7 +296,10 @@ If the PCI device can use the PCI Memory-Write-Invalidate transaction,
 call pci_set_mwi().  This enables the PCI_COMMAND bit for Mem-Wr-Inval
 and also ensures that the cache line size register is set correctly.
 Check the return value of pci_set_mwi() as not all architectures
-or chip-sets may support Memory-Write-Invalidate.
+or chip-sets may support Memory-Write-Invalidate.  Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
 
 
 3.2 Request MMIO/IOP resources
diff --git a/drivers/ata/pata_cs5530.c b/drivers/ata/pata_cs5530.c
index 3fca5898642b..68f150a1e2f4 100644
--- a/drivers/ata/pata_cs5530.c
+++ b/drivers/ata/pata_cs5530.c
@@ -266,7 +266,7 @@ static int cs5530_init_chip(void)
 	}
 
 	pci_set_master(cs5530_0);
-	pci_set_mwi(cs5530_0);
+	pci_try_set_mwi(cs5530_0);
 
 	/*
 	 * Set PCI CacheLineSize to 16-bytes:
diff --git a/drivers/ide/pci/cs5530.c b/drivers/ide/pci/cs5530.c
index 1eec1f308d16..b5c00d15a704 100644
--- a/drivers/ide/pci/cs5530.c
+++ b/drivers/ide/pci/cs5530.c
@@ -236,7 +236,7 @@ static unsigned int __devinit init_chipset_cs5530 (struct pci_dev *dev, const ch
 	 */
 
 	pci_set_master(cs5530_0);
-	pci_set_mwi(cs5530_0);
+	pci_try_set_mwi(cs5530_0);
 
 	/*
 	 * Set PCI CacheLineSize to 16-bytes:
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 805924fc077a..f6e4030c73d1 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -4917,13 +4917,13 @@ static int __devinit cas_init_one(struct pci_dev *pdev,
 	pci_cmd &= ~PCI_COMMAND_SERR;
 	pci_cmd |= PCI_COMMAND_PARITY;
 	pci_write_config_word(pdev, PCI_COMMAND, pci_cmd);
-	if (pci_set_mwi(pdev))
+	if (pci_try_set_mwi(pdev))
 		printk(KERN_WARNING PFX "Could not enable MWI for %s\n",
 		       pci_name(pdev));
 
 	/*
 	 * On some architectures, the default cache line size set
-	 * by pci_set_mwi reduces perforamnce.  We have to increase
+	 * by pci_try_set_mwi reduces perforamnce.  We have to increase
 	 * it for this case.  To start, we'll print some configuration
 	 * data.
 	 */
diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c
index 786d4b9c07ec..2f69d5c5dfa6 100644
--- a/drivers/net/starfire.c
+++ b/drivers/net/starfire.c
@@ -740,7 +740,7 @@ static int __devinit starfire_init_one(struct pci_dev *pdev,
 	pci_set_master(pdev);
 
 	/* enable MWI -- it vastly improves Rx performance on sparc64 */
-	pci_set_mwi(pdev);
+	pci_try_set_mwi(pdev);
 
 #ifdef ZEROCOPY
 	/* Starfire can do TCP/UDP checksumming */
diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c
index 1a9e911b86a2..7dcd138b0fed 100644
--- a/drivers/net/tulip/tulip_core.c
+++ b/drivers/net/tulip/tulip_core.c
@@ -1155,7 +1155,7 @@ static void __devinit tulip_mwi_config (struct pci_dev *pdev,
 	/* set or disable MWI in the standard PCI command bit.
 	 * Check for the case where  mwi is desired but not available
 	 */
-	if (csr0 & MWI)	pci_set_mwi(pdev);
+	if (csr0 & MWI)	pci_try_set_mwi(pdev);
 	else		pci_clear_mwi(pdev);
 
 	/* read result from hardware (in case bit refused to enable) */
diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c
index 25d6c80c9bab..af2e4f2405f2 100644
--- a/drivers/net/wireless/prism54/islpci_hotplug.c
+++ b/drivers/net/wireless/prism54/islpci_hotplug.c
@@ -166,8 +166,7 @@ prism54_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	pci_set_master(pdev);
 
 	/* enable MWI */
-	if (!pci_set_mwi(pdev))
-		printk(KERN_INFO "%s: pci_set_mwi(pdev) succeeded\n", DRV_NAME);
+	pci_try_set_mwi(pdev);
 
 	/* setup the network device interface and its structure */
 	if (!(ndev = islpci_setup(pdev))) {
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d9fc1bd1f3e8..35fa30aa3065 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1186,6 +1186,11 @@ int pci_set_mwi(struct pci_dev *dev)
 	return 0;
 }
 
+int pci_try_set_mwi(struct pci_dev *dev)
+{
+	return 0;
+}
+
 void pci_clear_mwi(struct pci_dev *dev)
 {
 }
@@ -1242,9 +1247,7 @@ pci_set_cacheline_size(struct pci_dev *dev)
  * pci_set_mwi - enables memory-write-invalidate PCI transaction
  * @dev: the PCI device for which MWI is enabled
  *
- * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND,
- * and then calls @pcibios_set_mwi to do the needed arch specific
- * operations or a generic mwi-prep function.
+ * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND.
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
@@ -1260,7 +1263,8 @@ pci_set_mwi(struct pci_dev *dev)
 
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	if (! (cmd & PCI_COMMAND_INVALIDATE)) {
-		pr_debug("PCI: Enabling Mem-Wr-Inval for device %s\n", pci_name(dev));
+		pr_debug("PCI: Enabling Mem-Wr-Inval for device %s\n",
+			pci_name(dev));
 		cmd |= PCI_COMMAND_INVALIDATE;
 		pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
@@ -1268,6 +1272,21 @@ pci_set_mwi(struct pci_dev *dev)
 	return 0;
 }
 
+/**
+ * pci_try_set_mwi - enables memory-write-invalidate PCI transaction
+ * @dev: the PCI device for which MWI is enabled
+ *
+ * Enables the Memory-Write-Invalidate transaction in %PCI_COMMAND.
+ * Callers are not required to check the return value.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int pci_try_set_mwi(struct pci_dev *dev)
+{
+	int rc = pci_set_mwi(dev);
+	return rc;
+}
+
 /**
  * pci_clear_mwi - disables Memory-Write-Invalidate for device dev
  * @dev: the PCI device to disable
@@ -1600,6 +1619,7 @@ EXPORT_SYMBOL(pci_release_selected_regions);
 EXPORT_SYMBOL(pci_request_selected_regions);
 EXPORT_SYMBOL(pci_set_master);
 EXPORT_SYMBOL(pci_set_mwi);
+EXPORT_SYMBOL(pci_try_set_mwi);
 EXPORT_SYMBOL(pci_clear_mwi);
 EXPORT_SYMBOL_GPL(pci_intx);
 EXPORT_SYMBOL(pci_set_dma_mask);
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index dcb4ba0ecee1..955b2e48d041 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -1578,10 +1578,7 @@ lpfc_pci_probe_one(struct pci_dev *pdev, const struct pci_device_id *pid)
 	INIT_LIST_HEAD(&phba->fc_nodes);
 
 	pci_set_master(pdev);
-	retval = pci_set_mwi(pdev);
-	if (retval)
-		dev_printk(KERN_WARNING, &pdev->dev,
-			   "Warning: pci_set_mwi returned %d\n", retval);
+	pci_try_set_mwi(pdev);
 
 	if (pci_set_dma_mask(phba->pcidev, DMA_64BIT_MASK) != 0)
 		if (pci_set_dma_mask(phba->pcidev, DMA_32BIT_MASK) != 0)
diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c
index d975ecf18e00..00fda334dc72 100644
--- a/drivers/usb/gadget/net2280.c
+++ b/drivers/usb/gadget/net2280.c
@@ -2964,7 +2964,7 @@ static int net2280_probe (struct pci_dev *pdev, const struct pci_device_id *id)
 			, &dev->pci->pcimstctl);
 	/* erratum 0115 shouldn't appear: Linux inits PCI_LATENCY_TIMER */
 	pci_set_master (pdev);
-	pci_set_mwi (pdev);
+	pci_try_set_mwi (pdev);
 
 	/* ... also flushes any posted pci writes */
 	dev->chiprev = get_idx_reg (dev->regs, REG_CHIPREV) & 0xffff;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a6657b7f245d..a5602e26f4dd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -545,6 +545,7 @@ void pci_set_master(struct pci_dev *dev);
 int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state);
 #define HAVE_PCI_SET_MWI
 int __must_check pci_set_mwi(struct pci_dev *dev);
+int pci_try_set_mwi(struct pci_dev *dev);
 void pci_clear_mwi(struct pci_dev *dev);
 void pci_intx(struct pci_dev *dev, int enable);
 void pci_msi_off(struct pci_dev *dev);
-- 
cgit v1.3-8-gc7d7


From cfc94cdf8e0f14e692a5a40ef3cc10f464b2511b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 9 May 2007 13:19:52 +0200
Subject: debugfs: add rename for debugfs files

Implement debugfs_rename() to allow renaming files/directories in debugfs.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/inode.c      | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/debugfs.h |  9 +++++++
 2 files changed, 72 insertions(+)

(limited to 'include/linux')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index ec8896b264de..1d533a2ec3a6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -368,6 +368,69 @@ void debugfs_remove(struct dentry *dentry)
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
 
+/**
+ * debugfs_rename - rename a file/directory in the debugfs filesystem
+ * @old_dir: a pointer to the parent dentry for the renamed object. This
+ *          should be a directory dentry.
+ * @old_dentry: dentry of an object to be renamed.
+ * @new_dir: a pointer to the parent dentry where the object should be
+ *          moved. This should be a directory dentry.
+ * @new_name: a pointer to a string containing the target name.
+ *
+ * This function renames a file/directory in debugfs.  The target must not
+ * exist for rename to succeed.
+ *
+ * This function will return a pointer to old_dentry (which is updated to
+ * reflect renaming) if it succeeds. If an error occurs, %NULL will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
+		struct dentry *new_dir, const char *new_name)
+{
+	int error;
+	struct dentry *dentry = NULL, *trap;
+	const char *old_name;
+
+	trap = lock_rename(new_dir, old_dir);
+	/* Source or destination directories don't exist? */
+	if (!old_dir->d_inode || !new_dir->d_inode)
+		goto exit;
+	/* Source does not exist, cyclic rename, or mountpoint? */
+	if (!old_dentry->d_inode || old_dentry == trap ||
+	    d_mountpoint(old_dentry))
+		goto exit;
+	dentry = lookup_one_len(new_name, new_dir, strlen(new_name));
+	/* Lookup failed, cyclic rename or target exists? */
+	if (IS_ERR(dentry) || dentry == trap || dentry->d_inode)
+		goto exit;
+
+	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
+
+	error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode,
+		dentry);
+	if (error) {
+		fsnotify_oldname_free(old_name);
+		goto exit;
+	}
+	d_move(old_dentry, dentry);
+	fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
+		old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode),
+		NULL, old_dentry->d_inode);
+	fsnotify_oldname_free(old_name);
+	unlock_rename(new_dir, old_dir);
+	dput(dentry);
+	return old_dentry;
+exit:
+	if (dentry && !IS_ERR(dentry))
+		dput(dentry);
+	unlock_rename(new_dir, old_dir);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(debugfs_rename);
+
 static decl_subsys(debug, NULL, NULL);
 
 static int __init debugfs_init(void)
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 5a9c49534d08..104e51e20e14 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -38,6 +38,9 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 
 void debugfs_remove(struct dentry *dentry);
 
+struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
+                struct dentry *new_dir, const char *new_name);
+
 struct dentry *debugfs_create_u8(const char *name, mode_t mode,
 				 struct dentry *parent, u8 *value);
 struct dentry *debugfs_create_u16(const char *name, mode_t mode,
@@ -85,6 +88,12 @@ static inline struct dentry *debugfs_create_symlink(const char *name,
 static inline void debugfs_remove(struct dentry *dentry)
 { }
 
+static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
+                struct dentry *new_dir, char *new_name)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline struct dentry *debugfs_create_u8(const char *name, mode_t mode,
 					       struct dentry *parent,
 					       u8 *value)
-- 
cgit v1.3-8-gc7d7


From 4f5c791a850e5305a5b1b48d0e4b4de248dc96f9 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <mzxreary@0pointer.de>
Date: Tue, 8 May 2007 22:07:02 +0200
Subject: DMI-based module autoloading

The patch below adds DMI/SMBIOS based module autoloading to the Linux
kernel. The idea is to load laptop drivers automatically (and other
drivers which cannot be autoloaded otherwise), based on the DMI system
identification information of the BIOS.

Right now most distros manually try to load all available laptop
drivers on bootup in the hope that at least one of them loads
successfully. This patch does away with all that, and uses udev to
automatically load matching drivers on the right machines.

Basically the patch just exports the DMI information that has been
parsed by the kernel anyway to userspace via a sysfs device
/sys/class/dmi/id and makes sure that proper modalias attributes are
available. Besides adding the "modalias" attribute it also adds
attributes for a few other DMI fields which might be useful for
writing udev rules.

This patch is not an attempt to export the entire DMI/SMBIOS data to
userspace. We already have "dmidecode" which parses the complete DMI
info from userspace. The purpose of this patch is machine model
identification and good udev integration.

To take advantage of DMI based module autoloading, a driver should
export one or more MODULE_ALIAS fields similar to these:

MODULE_ALIAS("dmi:*:svnMICRO-STARINT'LCO.,LTD:pnMS-1013:pvr0131*:cvnMICRO-STARINT'LCO.,LTD:ct10:*");
MODULE_ALIAS("dmi:*:svnMicro-StarInternational:pnMS-1058:pvr0581:rvnMSI:rnMS-1058:*:ct10:*");
MODULE_ALIAS("dmi:*:svnMicro-StarInternational:pnMS-1412:*:rvnMSI:rnMS-1412:*:cvnMICRO-STARINT'LCO.,LTD:ct10:*");
MODULE_ALIAS("dmi:*:svnNOTEBOOK:pnSAM2000:pvr0131*:cvnMICRO-STARINT'LCO.,LTD:ct10:*");

These lines are specific to my msi-laptop.c driver. They are basically
just a concatenation of a few carefully selected DMI fields with all
potentially bad characters stripped.

Besides laptop drivers, modules like "hdaps", the i2c modules
and the hwmon modules are good candidates for "dmi:" MODULE_ALIAS
lines.

Besides merely exporting the DMI data via sysfs the patch adds
support for a few more DMI fields. Especially the CHASSIS fields are
very useful to identify different laptop modules. The patch also adds
working MODULE_ALIAS lines to my msi-laptop.c driver.

I'd like to thank Kay Sievers for helping me to clean up this patch
for posting it on lkml.

Patch is against Linus' current GIT HEAD. Should probably apply to
older kernels as well without modification.


Signed-off-by: Lennart Poettering <mzxreary@0pointer.de>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/firmware/Kconfig    |   9 ++
 drivers/firmware/Makefile   |   1 +
 drivers/firmware/dmi-id.c   | 222 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/firmware/dmi_scan.c |  73 +++++++++++++--
 drivers/misc/msi-laptop.c   |  44 ++++++++-
 include/linux/dmi.h         |   8 ++
 6 files changed, 348 insertions(+), 9 deletions(-)
 create mode 100644 drivers/firmware/dmi-id.c

(limited to 'include/linux')

diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 88f462122a30..05f02a326f1c 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -84,4 +84,13 @@ config DCDBAS
 	  Say Y or M here to enable the driver for use by Dell systems
 	  management software such as Dell OpenManage.
 
+config DMIID
+    bool "Export DMI identification via sysfs to userspace"
+    depends on DMI
+    default y
+	help
+	  Say Y here if you want to query SMBIOS/DMI system identification
+	  information from userspace through /sys/class/dmi/id/ or if you want
+	  DMI-based module auto-loading.
+
 endmenu
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 98e395f4bb29..8d4ebc805a50 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_EFI_VARS)		+= efivars.o
 obj-$(CONFIG_EFI_PCDP)		+= pcdp.o
 obj-$(CONFIG_DELL_RBU)          += dell_rbu.o
 obj-$(CONFIG_DCDBAS)		+= dcdbas.o
+obj-$(CONFIG_DMIID)		+= dmi-id.o
diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c
new file mode 100644
index 000000000000..59c3b5aa89f4
--- /dev/null
+++ b/drivers/firmware/dmi-id.c
@@ -0,0 +1,222 @@
+/*
+ * Export SMBIOS/DMI info via sysfs to userspace
+ *
+ * Copyright 2007, Lennart Poettering
+ *
+ * Licensed under GPLv2
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/device.h>
+#include <linux/autoconf.h>
+
+#define DEFINE_DMI_ATTR(_name, _mode, _show)		\
+static struct device_attribute sys_dmi_##_name##_attr =	\
+	__ATTR(_name, _mode, _show, NULL);
+
+#define DEFINE_DMI_ATTR_WITH_SHOW(_name, _mode, _field)			\
+static ssize_t sys_dmi_##_name##_show(struct device *dev,		\
+				      struct device_attribute *attr,	\
+				      char *page)			\
+{									\
+	ssize_t len;							\
+	len = scnprintf(page, PAGE_SIZE, "%s\n", dmi_get_system_info(_field)); \
+	page[len-1] = '\n';						\
+	return len;							\
+}									\
+DEFINE_DMI_ATTR(_name, _mode, sys_dmi_##_name##_show);
+
+DEFINE_DMI_ATTR_WITH_SHOW(bios_vendor,		0444, DMI_BIOS_VENDOR);
+DEFINE_DMI_ATTR_WITH_SHOW(bios_version,		0444, DMI_BIOS_VERSION);
+DEFINE_DMI_ATTR_WITH_SHOW(bios_date,		0444, DMI_BIOS_DATE);
+DEFINE_DMI_ATTR_WITH_SHOW(sys_vendor,		0444, DMI_SYS_VENDOR);
+DEFINE_DMI_ATTR_WITH_SHOW(product_name,		0444, DMI_PRODUCT_NAME);
+DEFINE_DMI_ATTR_WITH_SHOW(product_version,	0444, DMI_PRODUCT_VERSION);
+DEFINE_DMI_ATTR_WITH_SHOW(product_serial,	0400, DMI_PRODUCT_SERIAL);
+DEFINE_DMI_ATTR_WITH_SHOW(product_uuid,		0400, DMI_PRODUCT_UUID);
+DEFINE_DMI_ATTR_WITH_SHOW(board_vendor,		0444, DMI_BOARD_VENDOR);
+DEFINE_DMI_ATTR_WITH_SHOW(board_name,		0444, DMI_BOARD_NAME);
+DEFINE_DMI_ATTR_WITH_SHOW(board_version,	0444, DMI_BOARD_VERSION);
+DEFINE_DMI_ATTR_WITH_SHOW(board_serial,		0400, DMI_BOARD_SERIAL);
+DEFINE_DMI_ATTR_WITH_SHOW(board_asset_tag,	0444, DMI_BOARD_ASSET_TAG);
+DEFINE_DMI_ATTR_WITH_SHOW(chassis_vendor,	0444, DMI_CHASSIS_VENDOR);
+DEFINE_DMI_ATTR_WITH_SHOW(chassis_type,		0444, DMI_CHASSIS_TYPE);
+DEFINE_DMI_ATTR_WITH_SHOW(chassis_version,	0444, DMI_CHASSIS_VERSION);
+DEFINE_DMI_ATTR_WITH_SHOW(chassis_serial,	0400, DMI_CHASSIS_SERIAL);
+DEFINE_DMI_ATTR_WITH_SHOW(chassis_asset_tag,	0444, DMI_CHASSIS_ASSET_TAG);
+
+static void ascii_filter(char *d, const char *s)
+{
+	/* Filter out characters we don't want to see in the modalias string */
+	for (; *s; s++)
+		if (*s > ' ' && *s < 127 && *s != ':')
+			*(d++) = *s;
+
+	*d = 0;
+}
+
+static ssize_t get_modalias(char *buffer, size_t buffer_size)
+{
+	static const struct mafield {
+		const char *prefix;
+		int field;
+	} fields[] = {
+		{ "bvn", DMI_BIOS_VENDOR },
+		{ "bvr", DMI_BIOS_VERSION },
+		{ "bd",  DMI_BIOS_DATE },
+		{ "svn", DMI_SYS_VENDOR },
+		{ "pn",  DMI_PRODUCT_NAME },
+		{ "pvr", DMI_PRODUCT_VERSION },
+		{ "rvn", DMI_BOARD_VENDOR },
+		{ "rn",  DMI_BOARD_NAME },
+		{ "rvr", DMI_BOARD_VERSION },
+		{ "cvn", DMI_CHASSIS_VENDOR },
+		{ "ct",  DMI_CHASSIS_TYPE },
+		{ "cvr", DMI_CHASSIS_VERSION },
+		{ NULL,  DMI_NONE }
+	};
+
+	ssize_t l, left;
+	char *p;
+	const struct mafield *f;
+
+	strcpy(buffer, "dmi");
+	p = buffer + 3; left = buffer_size - 4;
+
+	for (f = fields; f->prefix && left > 0; f++) {
+		const char *c;
+		char *t;
+
+		c = dmi_get_system_info(f->field);
+		if (!c)
+			continue;
+
+		t = kmalloc(strlen(c) + 1, GFP_KERNEL);
+		if (!t)
+			break;
+		ascii_filter(t, c);
+		l = scnprintf(p, left, ":%s%s", f->prefix, t);
+		kfree(t);
+
+		p += l;
+		left -= l;
+	}
+
+	p[0] = ':';
+	p[1] = 0;
+
+	return p - buffer + 1;
+}
+
+static ssize_t sys_dmi_modalias_show(struct device *dev,
+				     struct device_attribute *attr, char *page)
+{
+	ssize_t r;
+	r = get_modalias(page, PAGE_SIZE-1);
+	page[r] = '\n';
+	page[r+1] = 0;
+	return r+1;
+}
+
+DEFINE_DMI_ATTR(modalias, 0444, sys_dmi_modalias_show);
+
+static struct attribute *sys_dmi_attributes[DMI_STRING_MAX+2];
+
+static struct attribute_group sys_dmi_attribute_group = {
+	.attrs = sys_dmi_attributes,
+};
+
+static struct attribute_group* sys_dmi_attribute_groups[] = {
+	&sys_dmi_attribute_group,
+	NULL
+};
+
+static int dmi_dev_uevent(struct device *dev, char **envp,
+			    int num_envp, char *buffer, int buffer_size)
+{
+	strcpy(buffer, "MODALIAS=");
+	get_modalias(buffer+9, buffer_size-9);
+	envp[0] = buffer;
+	envp[1] = NULL;
+
+	return 0;
+}
+
+static struct class dmi_class = {
+	.name = "dmi",
+	.dev_release = (void(*)(struct device *)) kfree,
+	.dev_uevent = dmi_dev_uevent,
+};
+
+static struct device *dmi_dev;
+
+/* Initialization */
+
+#define ADD_DMI_ATTR(_name, _field) \
+	if (dmi_get_system_info(_field)) \
+		sys_dmi_attributes[i++] = & sys_dmi_##_name##_attr.attr;
+
+extern int dmi_available;
+
+static int __init dmi_id_init(void)
+{
+	int ret, i;
+
+	if (!dmi_available)
+		return -ENODEV;
+
+	/* Not necessarily all DMI fields are available on all
+	 * systems, hence let's built an attribute table of just
+	 * what's available */
+	i = 0;
+	ADD_DMI_ATTR(bios_vendor,       DMI_BIOS_VENDOR);
+	ADD_DMI_ATTR(bios_version,      DMI_BIOS_VERSION);
+	ADD_DMI_ATTR(bios_date,         DMI_BIOS_DATE);
+	ADD_DMI_ATTR(sys_vendor,        DMI_SYS_VENDOR);
+	ADD_DMI_ATTR(product_name,      DMI_PRODUCT_NAME);
+	ADD_DMI_ATTR(product_version,   DMI_PRODUCT_VERSION);
+	ADD_DMI_ATTR(product_serial,    DMI_PRODUCT_SERIAL);
+	ADD_DMI_ATTR(product_uuid,      DMI_PRODUCT_UUID);
+	ADD_DMI_ATTR(board_vendor,      DMI_BOARD_VENDOR);
+	ADD_DMI_ATTR(board_name,        DMI_BOARD_NAME);
+	ADD_DMI_ATTR(board_version,     DMI_BOARD_VERSION);
+	ADD_DMI_ATTR(board_serial,      DMI_BOARD_SERIAL);
+	ADD_DMI_ATTR(board_asset_tag,   DMI_BOARD_ASSET_TAG);
+	ADD_DMI_ATTR(chassis_vendor,    DMI_CHASSIS_VENDOR);
+	ADD_DMI_ATTR(chassis_type,      DMI_CHASSIS_TYPE);
+	ADD_DMI_ATTR(chassis_version,   DMI_CHASSIS_VERSION);
+	ADD_DMI_ATTR(chassis_serial,    DMI_CHASSIS_SERIAL);
+	ADD_DMI_ATTR(chassis_asset_tag, DMI_CHASSIS_ASSET_TAG);
+	sys_dmi_attributes[i++] = &sys_dmi_modalias_attr.attr;
+
+	ret = class_register(&dmi_class);
+	if (ret)
+		return ret;
+
+	dmi_dev = kzalloc(sizeof(*dmi_dev), GFP_KERNEL);
+	if (!dmi_dev) {
+		ret = -ENOMEM;
+		goto fail_class_unregister;
+	}
+
+	dmi_dev->class = &dmi_class;
+	strcpy(dmi_dev->bus_id, "id");
+	dmi_dev->groups = sys_dmi_attribute_groups;
+
+	ret = device_register(dmi_dev);
+	if (ret)
+		goto fail_class_unregister;
+
+	return 0;
+
+fail_class_unregister:
+
+	class_unregister(&dmi_class);
+
+	return ret;
+}
+
+arch_initcall(dmi_id_init);
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 37deee6c0c1c..f7318b3b51f2 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -84,6 +84,7 @@ static int __init dmi_checksum(u8 *buf)
 
 static char *dmi_ident[DMI_STRING_MAX];
 static LIST_HEAD(dmi_devices);
+int dmi_available;
 
 /*
  *	Save a DMI string
@@ -102,6 +103,51 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string)
 	dmi_ident[slot] = p;
 }
 
+static void __init dmi_save_uuid(struct dmi_header *dm, int slot, int index)
+{
+	u8 *d = (u8*) dm + index;
+	char *s;
+	int is_ff = 1, is_00 = 1, i;
+
+	if (dmi_ident[slot])
+		return;
+
+	for (i = 0; i < 16 && (is_ff || is_00); i++) {
+		if(d[i] != 0x00) is_ff = 0;
+		if(d[i] != 0xFF) is_00 = 0;
+	}
+
+	if (is_ff || is_00)
+		return;
+
+	s = dmi_alloc(16*2+4+1);
+	if (!s)
+		return;
+
+	sprintf(s,
+		"%02X%02X%02X%02X-%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
+		d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7],
+		d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
+
+        dmi_ident[slot] = s;
+}
+
+static void __init dmi_save_type(struct dmi_header *dm, int slot, int index)
+{
+	u8 *d = (u8*) dm + index;
+	char *s;
+
+	if (dmi_ident[slot])
+		return;
+
+	s = dmi_alloc(4);
+	if (!s)
+		return;
+
+	sprintf(s, "%u", *d & 0x7F);
+	dmi_ident[slot] = s;
+}
+
 static void __init dmi_save_devices(struct dmi_header *dm)
 {
 	int i, count = (dm->length - sizeof(struct dmi_header)) / 2;
@@ -192,11 +238,21 @@ static void __init dmi_decode(struct dmi_header *dm)
 		dmi_save_ident(dm, DMI_PRODUCT_NAME, 5);
 		dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6);
 		dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7);
+		dmi_save_uuid(dm, DMI_PRODUCT_UUID, 8);
 		break;
 	case 2:		/* Base Board Information */
 		dmi_save_ident(dm, DMI_BOARD_VENDOR, 4);
 		dmi_save_ident(dm, DMI_BOARD_NAME, 5);
 		dmi_save_ident(dm, DMI_BOARD_VERSION, 6);
+		dmi_save_ident(dm, DMI_BOARD_SERIAL, 7);
+		dmi_save_ident(dm, DMI_BOARD_ASSET_TAG, 8);
+		break;
+	case 3:		/* Chassis Information */
+		dmi_save_ident(dm, DMI_CHASSIS_VENDOR, 4);
+		dmi_save_type(dm, DMI_CHASSIS_TYPE, 5);
+		dmi_save_ident(dm, DMI_CHASSIS_VERSION, 6);
+		dmi_save_ident(dm, DMI_CHASSIS_SERIAL, 7);
+		dmi_save_ident(dm, DMI_CHASSIS_ASSET_TAG, 8);
 		break;
 	case 10:	/* Onboard Devices Information */
 		dmi_save_devices(dm);
@@ -243,18 +299,20 @@ void __init dmi_scan_machine(void)
 		if (efi.smbios == EFI_INVALID_TABLE_ADDR)
 			goto out;
 
-               /* This is called as a core_initcall() because it isn't
-                * needed during early boot.  This also means we can
-                * iounmap the space when we're done with it.
-		*/
+		/* This is called as a core_initcall() because it isn't
+		 * needed during early boot.  This also means we can
+		 * iounmap the space when we're done with it.
+		 */
 		p = dmi_ioremap(efi.smbios, 32);
 		if (p == NULL)
 			goto out;
 
 		rc = dmi_present(p + 0x10); /* offset of _DMI_ string */
 		dmi_iounmap(p, 32);
-		if (!rc)
+		if (!rc) {
+			dmi_available = 1;
 			return;
+		}
 	}
 	else {
 		/*
@@ -268,8 +326,10 @@ void __init dmi_scan_machine(void)
 
 		for (q = p; q < p + 0x10000; q += 16) {
 			rc = dmi_present(q);
-			if (!rc)
+			if (!rc) {
+				dmi_available = 1;
 				return;
+			}
 		}
 	}
  out:	printk(KERN_INFO "DMI not present or invalid.\n");
@@ -404,3 +464,4 @@ int dmi_get_year(int field)
 
 	return year;
 }
+
diff --git a/drivers/misc/msi-laptop.c b/drivers/misc/msi-laptop.c
index 41e901f53e7c..932a415197b3 100644
--- a/drivers/misc/msi-laptop.c
+++ b/drivers/misc/msi-laptop.c
@@ -23,6 +23,8 @@
  * msi-laptop.c - MSI S270 laptop support. This laptop is sold under
  * various brands, including "Cytron/TCM/Medion/Tchibo MD96100".
  *
+ * Driver also supports S271, S420 models.
+ *
  * This driver exports a few files in /sys/devices/platform/msi-laptop-pf/:
  *
  *   lcd_level - Screen brightness: contains a single integer in the
@@ -281,25 +283,56 @@ static struct platform_device *msipf_device;
 
 /* Initialization */
 
+static int dmi_check_cb(struct dmi_system_id *id)
+{
+        printk("msi-laptop: Identified laptop model '%s'.\n", id->ident);
+        return 0;
+}
+
 static struct dmi_system_id __initdata msi_dmi_table[] = {
 	{
 		.ident = "MSI S270",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "MICRO-STAR INT'L CO.,LTD"),
 			DMI_MATCH(DMI_PRODUCT_NAME, "MS-1013"),
-		}
+			DMI_MATCH(DMI_PRODUCT_VERSION, "0131"),
+			DMI_MATCH(DMI_CHASSIS_VENDOR, "MICRO-STAR INT'L CO.,LTD")
+		},
+		.callback = dmi_check_cb
+	},
+	{
+		.ident = "MSI S271",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star International"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MS-1058"),
+			DMI_MATCH(DMI_PRODUCT_VERSION, "0581"),
+			DMI_MATCH(DMI_BOARD_NAME, "MS-1058")
+		},
+		.callback = dmi_check_cb
+	},
+	{
+		.ident = "MSI S420",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star International"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MS-1412"),
+			DMI_MATCH(DMI_BOARD_VENDOR, "MSI"),
+			DMI_MATCH(DMI_BOARD_NAME, "MS-1412")
+		},
+		.callback = dmi_check_cb
 	},
 	{
 		.ident = "Medion MD96100",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "NOTEBOOK"),
 			DMI_MATCH(DMI_PRODUCT_NAME, "SAM2000"),
-		}
+			DMI_MATCH(DMI_PRODUCT_VERSION, "0131"),
+			DMI_MATCH(DMI_CHASSIS_VENDOR, "MICRO-STAR INT'L CO.,LTD")
+		},
+		.callback = dmi_check_cb
 	},
 	{ }
 };
 
-
 static int __init msi_init(void)
 {
 	int ret;
@@ -394,3 +427,8 @@ MODULE_AUTHOR("Lennart Poettering");
 MODULE_DESCRIPTION("MSI Laptop Support");
 MODULE_VERSION(MSI_DRIVER_VERSION);
 MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("dmi:*:svnMICRO-STARINT'LCO.,LTD:pnMS-1013:pvr0131*:cvnMICRO-STARINT'LCO.,LTD:ct10:*");
+MODULE_ALIAS("dmi:*:svnMicro-StarInternational:pnMS-1058:pvr0581:rvnMSI:rnMS-1058:*:ct10:*");
+MODULE_ALIAS("dmi:*:svnMicro-StarInternational:pnMS-1412:*:rvnMSI:rnMS-1412:*:cvnMICRO-STARINT'LCO.,LTD:ct10:*");
+MODULE_ALIAS("dmi:*:svnNOTEBOOK:pnSAM2000:pvr0131*:cvnMICRO-STARINT'LCO.,LTD:ct10:*");
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 904bf3d2d90b..b8ac7b01c45e 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -12,9 +12,17 @@ enum dmi_field {
 	DMI_PRODUCT_NAME,
 	DMI_PRODUCT_VERSION,
 	DMI_PRODUCT_SERIAL,
+	DMI_PRODUCT_UUID,
 	DMI_BOARD_VENDOR,
 	DMI_BOARD_NAME,
 	DMI_BOARD_VERSION,
+	DMI_BOARD_SERIAL,
+	DMI_BOARD_ASSET_TAG,
+	DMI_CHASSIS_VENDOR,
+	DMI_CHASSIS_TYPE,
+	DMI_CHASSIS_VERSION,
+	DMI_CHASSIS_SERIAL,
+	DMI_CHASSIS_ASSET_TAG,
 	DMI_STRING_MAX,
 };
 
-- 
cgit v1.3-8-gc7d7


From 9cddad77574313fcee36c5e60122718daa7c0361 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 13 Jun 2007 15:53:34 +0200
Subject: PM: Remove pm_parent from struct dev_pm_info

The pm_parent member of struct dev_pm_info (defined in include/linux/pm.h) is
only used to check if the device's parent is in the right state while the
device is being suspended or resumed.  However, this can be done just as well
with the help of the parent pointer in struct device, so pm_parent can be
removed along with some code that handles it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/power/main.c    | 30 ++++--------------------------
 drivers/base/power/resume.c  |  7 +++----
 drivers/base/power/suspend.c |  7 +++----
 include/linux/pm.h           |  3 ---
 4 files changed, 10 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 7b3cc3c15b9d..eb9f38d0aa58 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -33,28 +33,7 @@ DEFINE_MUTEX(dpm_list_mtx);
 
 int (*platform_enable_wakeup)(struct device *dev, int is_on);
 
-
-/**
- *	device_pm_set_parent - Specify power dependency.
- *	@dev:		Device who needs power.
- *	@parent:	Device that supplies power.
- *
- *	This function is used to manually describe a power-dependency
- *	relationship. It may be used to specify a transversal relationship
- *	(where the power supplier is not the physical (or electrical)
- *	ancestor of a specific device.
- *	The effect of this is that the supplier will not be powered down
- *	before the power dependent.
- */
-
-void device_pm_set_parent(struct device * dev, struct device * parent)
-{
-	put_device(dev->power.pm_parent);
-	dev->power.pm_parent = get_device(parent);
-}
-EXPORT_SYMBOL_GPL(device_pm_set_parent);
-
-int device_pm_add(struct device * dev)
+int device_pm_add(struct device *dev)
 {
 	int error;
 
@@ -63,21 +42,20 @@ int device_pm_add(struct device * dev)
 		 kobject_name(&dev->kobj));
 	mutex_lock(&dpm_list_mtx);
 	list_add_tail(&dev->power.entry, &dpm_active);
-	device_pm_set_parent(dev, dev->parent);
-	if ((error = dpm_sysfs_add(dev)))
+	error = dpm_sysfs_add(dev);
+	if (error)
 		list_del(&dev->power.entry);
 	mutex_unlock(&dpm_list_mtx);
 	return error;
 }
 
-void device_pm_remove(struct device * dev)
+void device_pm_remove(struct device *dev)
 {
 	pr_debug("PM: Removing info for %s:%s\n",
 		 dev->bus ? dev->bus->name : "No Bus",
 		 kobject_name(&dev->kobj));
 	mutex_lock(&dpm_list_mtx);
 	dpm_sysfs_remove(dev);
-	put_device(dev->power.pm_parent);
 	list_del_init(&dev->power.entry);
 	mutex_unlock(&dpm_list_mtx);
 }
diff --git a/drivers/base/power/resume.c b/drivers/base/power/resume.c
index f6cfea496ea0..99679e7a6cc6 100644
--- a/drivers/base/power/resume.c
+++ b/drivers/base/power/resume.c
@@ -29,12 +29,11 @@ int resume_device(struct device * dev)
 
 	down(&dev->sem);
 
-	if (dev->power.pm_parent
-			&& dev->power.pm_parent->power.power_state.event) {
+	if (dev->parent && dev->parent->power.power_state.event) {
 		dev_err(dev, "PM: resume from %d, parent %s still %d\n",
 			dev->power.power_state.event,
-			dev->power.pm_parent->bus_id,
-			dev->power.pm_parent->power.power_state.event);
+			dev->parent->bus_id,
+			dev->parent->power.power_state.event);
 	}
 
 	if (dev->bus && dev->bus->resume) {
diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index 9d6701cd7f10..19fae88de7b3 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -55,13 +55,12 @@ int suspend_device(struct device * dev, pm_message_t state)
 		dev_dbg(dev, "PM: suspend %d-->%d\n",
 			dev->power.power_state.event, state.event);
 	}
-	if (dev->power.pm_parent
-			&& dev->power.pm_parent->power.power_state.event) {
+	if (dev->parent && dev->parent->power.power_state.event) {
 		dev_err(dev,
 			"PM: suspend %d->%d, parent %s already %d\n",
 			dev->power.power_state.event, state.event,
-			dev->power.pm_parent->bus_id,
-			dev->power.pm_parent->power.power_state.event);
+			dev->parent->bus_id,
+			dev->parent->power.power_state.event);
 	}
 
 	dev->power.prev_state = dev->power.power_state;
diff --git a/include/linux/pm.h b/include/linux/pm.h
index b2c4fde4e994..3fd65ad4b097 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -269,13 +269,10 @@ struct dev_pm_info {
 	unsigned		should_wakeup:1;
 	pm_message_t		prev_state;
 	void			* saved_state;
-	struct device		* pm_parent;
 	struct list_head	entry;
 #endif
 };
 
-extern void device_pm_set_parent(struct device * dev, struct device * parent);
-
 extern int device_power_down(pm_message_t state);
 extern void device_power_up(void);
 extern void device_resume(void);
-- 
cgit v1.3-8-gc7d7


From cc4900690bf77257996e90f0059eb074b8db52e6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 13 Jun 2007 15:55:34 +0200
Subject: PM: Remove saved_state from struct dev_pm_info

The saved_state member of struct dev_pm_info, defined in include/linux/pm.h, is
not used anywhere, so it can be removed.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 3fd65ad4b097..6e7f06671683 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -268,7 +268,6 @@ struct dev_pm_info {
 #ifdef	CONFIG_PM
 	unsigned		should_wakeup:1;
 	pm_message_t		prev_state;
-	void			* saved_state;
 	struct list_head	entry;
 #endif
 };
-- 
cgit v1.3-8-gc7d7


From 515c53576299e32d6bdb6295cfa2fe1307516eb4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 17 Jun 2007 19:48:06 +0200
Subject: PM: Remove prev_state from struct dev_pm_info

The prev_state member of struct dev_pm_info (defined in include/linux/pm.h) is
only used during a resume to check if the device's state before the suspend was
'off', in which case the device is not resumed.  However, in such cases the
decision whether or not to resume the device should be made on the driver level
and the resume callbacks from the device's bus and class should be executed
anyway (the may be needed for some things other than just powering on the
device).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/power/resume.c  | 3 +--
 drivers/base/power/suspend.c | 2 --
 drivers/usb/core/hub.c       | 5 -----
 include/linux/pm.h           | 1 -
 4 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/resume.c b/drivers/base/power/resume.c
index 99679e7a6cc6..0c9610688542 100644
--- a/drivers/base/power/resume.c
+++ b/drivers/base/power/resume.c
@@ -88,8 +88,7 @@ void dpm_resume(void)
 		list_move_tail(entry, &dpm_active);
 
 		mutex_unlock(&dpm_list_mtx);
-		if (!dev->power.prev_state.event)
-			resume_device(dev);
+		resume_device(dev);
 		mutex_lock(&dpm_list_mtx);
 		put_device(dev);
 	}
diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index af2cedfbc1b4..5178b0fbd82e 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -71,8 +71,6 @@ int suspend_device(struct device * dev, pm_message_t state)
 			dev->parent->power.power_state.event);
 	}
 
-	dev->power.prev_state = dev->power.power_state;
-
 	if (dev->class && dev->class->suspend && !dev->power.power_state.event) {
 		suspend_device_dbg(dev, state, "class ");
 		error = dev->class->suspend(dev, state);
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 24f10a19dbdb..a9cf8b30bccc 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -1109,11 +1109,6 @@ void usb_root_hub_lost_power(struct usb_device *rhdev)
 
 	dev_warn(&rhdev->dev, "root hub lost power or was reset\n");
 
-	/* Make sure no potential wakeup events get lost,
-	 * by forcing the root hub to be resumed.
-	 */
-	rhdev->dev.power.prev_state.event = PM_EVENT_ON;
-
 	spin_lock_irqsave(&device_state_lock, flags);
 	hub = hdev_to_hub(rhdev);
 	for (port1 = 1; port1 <= rhdev->maxchild; ++port1) {
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 6e7f06671683..273781c82e4d 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -267,7 +267,6 @@ struct dev_pm_info {
 	unsigned		can_wakeup:1;
 #ifdef	CONFIG_PM
 	unsigned		should_wakeup:1;
-	pm_message_t		prev_state;
 	struct list_head	entry;
 #endif
 };
-- 
cgit v1.3-8-gc7d7


From 72dba584b695d8bc8c1a50ed54ad4cba7c62314d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 03:45:13 +0900
Subject: ida: implement idr based id allocator

Implement idr based id allocator.  ida is used the same way idr is
used but lacks id -> ptr translation and thus consumes much less
memory.  struct ida_bitmap is attached as leaf nodes to idr tree which
is managed by the idr code.  Each ida_bitmap is 128bytes long and
contains slightly less than a thousand slots.

ida is more aggressive with releasing extra resources acquired using
ida_pre_get().  After every successful id allocation, ida frees one
reserved idr_layer if possible.  Reserved ida_bitmap is not freed
automatically but only one ida_bitmap is reserved and it's almost
always used right away.  Under most circumstances, ida won't hold on
to memory for too long which isn't actively used.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/idr.h |  29 +++++++
 lib/idr.c           | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 274 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 826803449db7..915572fa030b 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -83,4 +83,33 @@ void idr_remove(struct idr *idp, int id);
 void idr_destroy(struct idr *idp);
 void idr_init(struct idr *idp);
 
+
+/*
+ * IDA - IDR based id allocator, use when translation from id to
+ * pointer isn't necessary.
+ */
+#define IDA_CHUNK_SIZE		128	/* 128 bytes per chunk */
+#define IDA_BITMAP_LONGS	(128 / sizeof(long) - 1)
+#define IDA_BITMAP_BITS		(IDA_BITMAP_LONGS * sizeof(long) * 8)
+
+struct ida_bitmap {
+	long			nr_busy;
+	unsigned long		bitmap[IDA_BITMAP_LONGS];
+};
+
+struct ida {
+	struct idr		idr;
+	struct ida_bitmap	*free_bitmap;
+};
+
+#define IDA_INIT(name)		{ .idr = IDR_INIT(name), .free_bitmap = NULL, }
+#define DEFINE_IDA(name)	struct ida name = IDA_INIT(name)
+
+int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
+int ida_get_new_above(struct ida *ida, int starting_id, int *p_id);
+int ida_get_new(struct ida *ida, int *p_id);
+void ida_remove(struct ida *ida, int id);
+void ida_destroy(struct ida *ida);
+void ida_init(struct ida *ida);
+
 #endif /* __IDR_H__ */
diff --git a/lib/idr.c b/lib/idr.c
index 30b33e2e7a50..b98f01a2eb94 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -506,3 +506,248 @@ void idr_init(struct idr *idp)
 	spin_lock_init(&idp->lock);
 }
 EXPORT_SYMBOL(idr_init);
+
+
+/*
+ * IDA - IDR based ID allocator
+ *
+ * this is id allocator without id -> pointer translation.  Memory
+ * usage is much lower than full blown idr because each id only
+ * occupies a bit.  ida uses a custom leaf node which contains
+ * IDA_BITMAP_BITS slots.
+ *
+ * 2007-04-25  written by Tejun Heo <htejun@gmail.com>
+ */
+
+static void free_bitmap(struct ida *ida, struct ida_bitmap *bitmap)
+{
+	unsigned long flags;
+
+	if (!ida->free_bitmap) {
+		spin_lock_irqsave(&ida->idr.lock, flags);
+		if (!ida->free_bitmap) {
+			ida->free_bitmap = bitmap;
+			bitmap = NULL;
+		}
+		spin_unlock_irqrestore(&ida->idr.lock, flags);
+	}
+
+	kfree(bitmap);
+}
+
+/**
+ * ida_pre_get - reserve resources for ida allocation
+ * @ida:	ida handle
+ * @gfp_mask:	memory allocation flag
+ *
+ * This function should be called prior to locking and calling the
+ * following function.  It preallocates enough memory to satisfy the
+ * worst possible allocation.
+ *
+ * If the system is REALLY out of memory this function returns 0,
+ * otherwise 1.
+ */
+int ida_pre_get(struct ida *ida, gfp_t gfp_mask)
+{
+	/* allocate idr_layers */
+	if (!idr_pre_get(&ida->idr, gfp_mask))
+		return 0;
+
+	/* allocate free_bitmap */
+	if (!ida->free_bitmap) {
+		struct ida_bitmap *bitmap;
+
+		bitmap = kmalloc(sizeof(struct ida_bitmap), gfp_mask);
+		if (!bitmap)
+			return 0;
+
+		free_bitmap(ida, bitmap);
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(ida_pre_get);
+
+/**
+ * ida_get_new_above - allocate new ID above or equal to a start id
+ * @ida:	ida handle
+ * @staring_id:	id to start search at
+ * @p_id:	pointer to the allocated handle
+ *
+ * Allocate new ID above or equal to @ida.  It should be called with
+ * any required locks.
+ *
+ * If memory is required, it will return -EAGAIN, you should unlock
+ * and go back to the ida_pre_get() call.  If the ida is full, it will
+ * return -ENOSPC.
+ *
+ * @p_id returns a value in the range 0 ... 0x7fffffff.
+ */
+int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
+{
+	struct idr_layer *pa[MAX_LEVEL];
+	struct ida_bitmap *bitmap;
+	unsigned long flags;
+	int idr_id = starting_id / IDA_BITMAP_BITS;
+	int offset = starting_id % IDA_BITMAP_BITS;
+	int t, id;
+
+ restart:
+	/* get vacant slot */
+	t = idr_get_empty_slot(&ida->idr, idr_id, pa);
+	if (t < 0) {
+		if (t == -1)
+			return -EAGAIN;
+		else /* will be -3 */
+			return -ENOSPC;
+	}
+
+	if (t * IDA_BITMAP_BITS >= MAX_ID_BIT)
+		return -ENOSPC;
+
+	if (t != idr_id)
+		offset = 0;
+	idr_id = t;
+
+	/* if bitmap isn't there, create a new one */
+	bitmap = (void *)pa[0]->ary[idr_id & IDR_MASK];
+	if (!bitmap) {
+		spin_lock_irqsave(&ida->idr.lock, flags);
+		bitmap = ida->free_bitmap;
+		ida->free_bitmap = NULL;
+		spin_unlock_irqrestore(&ida->idr.lock, flags);
+
+		if (!bitmap)
+			return -EAGAIN;
+
+		memset(bitmap, 0, sizeof(struct ida_bitmap));
+		pa[0]->ary[idr_id & IDR_MASK] = (void *)bitmap;
+		pa[0]->count++;
+	}
+
+	/* lookup for empty slot */
+	t = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, offset);
+	if (t == IDA_BITMAP_BITS) {
+		/* no empty slot after offset, continue to the next chunk */
+		idr_id++;
+		offset = 0;
+		goto restart;
+	}
+
+	id = idr_id * IDA_BITMAP_BITS + t;
+	if (id >= MAX_ID_BIT)
+		return -ENOSPC;
+
+	__set_bit(t, bitmap->bitmap);
+	if (++bitmap->nr_busy == IDA_BITMAP_BITS)
+		idr_mark_full(pa, idr_id);
+
+	*p_id = id;
+
+	/* Each leaf node can handle nearly a thousand slots and the
+	 * whole idea of ida is to have small memory foot print.
+	 * Throw away extra resources one by one after each successful
+	 * allocation.
+	 */
+	if (ida->idr.id_free_cnt || ida->free_bitmap) {
+		struct idr_layer *p = alloc_layer(&ida->idr);
+		if (p)
+			kmem_cache_free(idr_layer_cache, p);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ida_get_new_above);
+
+/**
+ * ida_get_new - allocate new ID
+ * @ida:	idr handle
+ * @p_id:	pointer to the allocated handle
+ *
+ * Allocate new ID.  It should be called with any required locks.
+ *
+ * If memory is required, it will return -EAGAIN, you should unlock
+ * and go back to the idr_pre_get() call.  If the idr is full, it will
+ * return -ENOSPC.
+ *
+ * @id returns a value in the range 0 ... 0x7fffffff.
+ */
+int ida_get_new(struct ida *ida, int *p_id)
+{
+	return ida_get_new_above(ida, 0, p_id);
+}
+EXPORT_SYMBOL(ida_get_new);
+
+/**
+ * ida_remove - remove the given ID
+ * @ida:	ida handle
+ * @id:		ID to free
+ */
+void ida_remove(struct ida *ida, int id)
+{
+	struct idr_layer *p = ida->idr.top;
+	int shift = (ida->idr.layers - 1) * IDR_BITS;
+	int idr_id = id / IDA_BITMAP_BITS;
+	int offset = id % IDA_BITMAP_BITS;
+	int n;
+	struct ida_bitmap *bitmap;
+
+	/* clear full bits while looking up the leaf idr_layer */
+	while ((shift > 0) && p) {
+		n = (idr_id >> shift) & IDR_MASK;
+		__clear_bit(n, &p->bitmap);
+		p = p->ary[n];
+		shift -= IDR_BITS;
+	}
+
+	if (p == NULL)
+		goto err;
+
+	n = idr_id & IDR_MASK;
+	__clear_bit(n, &p->bitmap);
+
+	bitmap = (void *)p->ary[n];
+	if (!test_bit(offset, bitmap->bitmap))
+		goto err;
+
+	/* update bitmap and remove it if empty */
+	__clear_bit(offset, bitmap->bitmap);
+	if (--bitmap->nr_busy == 0) {
+		__set_bit(n, &p->bitmap);	/* to please idr_remove() */
+		idr_remove(&ida->idr, idr_id);
+		free_bitmap(ida, bitmap);
+	}
+
+	return;
+
+ err:
+	printk(KERN_WARNING
+	       "ida_remove called for id=%d which is not allocated.\n", id);
+}
+EXPORT_SYMBOL(ida_remove);
+
+/**
+ * ida_destroy - release all cached layers within an ida tree
+ * ida:		ida handle
+ */
+void ida_destroy(struct ida *ida)
+{
+	idr_destroy(&ida->idr);
+	kfree(ida->free_bitmap);
+}
+EXPORT_SYMBOL(ida_destroy);
+
+/**
+ * ida_init - initialize ida handle
+ * @ida:	ida handle
+ *
+ * This function is use to set up the handle (@ida) that you will pass
+ * to the rest of the functions.
+ */
+void ida_init(struct ida *ida)
+{
+	memset(ida, 0, sizeof(struct ida));
+	idr_init(&ida->idr);
+
+}
+EXPORT_SYMBOL(ida_init);
-- 
cgit v1.3-8-gc7d7


From 0c096b507f15397da890051ee73de4266d3941fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 03:45:15 +0900
Subject: sysfs: add sysfs_dirent->s_name

Add s_name to sysfs_dirent.  This is to further reduce dependency to
the associated dentry.  Name is copied for directories and symlinks
but not for attributes.

Where possible, name dereferences are converted to use sd->s_name.
sysfs_symlink->link_name and sysfs_get_name() are unused now and
removed.

This change allows symlink to be implemented using sysfs_dirent tree
proper, which is the last remaining dentry-dependent sysfs walk.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c        | 67 +++++++++++++++++++++++++++++++++------------------
 fs/sysfs/file.c       |  2 +-
 fs/sysfs/inode.c      | 33 +------------------------
 fs/sysfs/symlink.c    |  8 +-----
 fs/sysfs/sysfs.h      |  7 +++---
 include/linux/sysfs.h |  1 +
 6 files changed, 51 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5d50e1ddfbd1..6e8d6f54f082 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -54,10 +54,11 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
 
 	if (sd->s_type & SYSFS_KOBJ_LINK) {
 		struct sysfs_symlink * sl = sd->s_element;
-		kfree(sl->link_name);
 		kobject_put(sl->target_kobj);
 		kfree(sl);
 	}
+	if (sd->s_type & SYSFS_COPY_NAME)
+		kfree(sd->s_name);
 	kfree(sd->s_iattr);
 	sysfs_free_ino(sd->s_ino);
 	kmem_cache_free(sysfs_dir_cachep, sd);
@@ -94,29 +95,41 @@ static struct dentry_operations sysfs_dentry_ops = {
 	.d_iput		= sysfs_d_iput,
 };
 
-struct sysfs_dirent *sysfs_new_dirent(void *element, umode_t mode, int type)
+struct sysfs_dirent *sysfs_new_dirent(const char *name, void *element,
+				      umode_t mode, int type)
 {
-	struct sysfs_dirent * sd;
+	char *dup_name = NULL;
+	struct sysfs_dirent *sd = NULL;
+
+	if (type & SYSFS_COPY_NAME) {
+		name = dup_name = kstrdup(name, GFP_KERNEL);
+		if (!name)
+			goto err_out;
+	}
 
 	sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
 	if (!sd)
-		return NULL;
+		goto err_out;
 
-	if (sysfs_alloc_ino(&sd->s_ino)) {
-		kmem_cache_free(sysfs_dir_cachep, sd);
-		return NULL;
-	}
+	if (sysfs_alloc_ino(&sd->s_ino))
+		goto err_out;
 
 	atomic_set(&sd->s_count, 1);
 	atomic_set(&sd->s_event, 1);
 	INIT_LIST_HEAD(&sd->s_children);
 	INIT_LIST_HEAD(&sd->s_sibling);
 
+	sd->s_name = name;
 	sd->s_element = element;
 	sd->s_mode = mode;
 	sd->s_type = type;
 
 	return sd;
+
+ err_out:
+	kfree(dup_name);
+	kmem_cache_free(sysfs_dir_cachep, sd);
+	return NULL;
 }
 
 void sysfs_attach_dirent(struct sysfs_dirent *sd,
@@ -148,8 +161,7 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd,
 
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (sd->s_element) {
-			const unsigned char *existing = sysfs_get_name(sd);
-			if (strcmp(existing, new))
+			if (strcmp(sd->s_name, new))
 				continue;
 			else
 				return -EEXIST;
@@ -203,7 +215,7 @@ static int create_dir(struct kobject *kobj, struct dentry *parent,
 		goto out_dput;
 
 	error = -ENOMEM;
-	sd = sysfs_new_dirent(kobj, mode, SYSFS_DIR);
+	sd = sysfs_new_dirent(name, kobj, mode, SYSFS_DIR);
 	if (!sd)
 		goto out_drop;
 	sysfs_attach_dirent(sd, parent->d_fsdata, dentry);
@@ -334,9 +346,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (sd->s_type & SYSFS_NOT_PINNED) {
-			const unsigned char * name = sysfs_get_name(sd);
-
-			if (strcmp(name, dentry->d_name.name))
+			if (strcmp(sd->s_name, dentry->d_name.name))
 				continue;
 
 			if (sd->s_type & SYSFS_KOBJ_LINK)
@@ -427,9 +437,11 @@ void sysfs_remove_dir(struct kobject * kobj)
 int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent,
 		     const char *new_name)
 {
+	struct sysfs_dirent *sd = kobj->dentry->d_fsdata;
+	struct sysfs_dirent *parent_sd = new_parent->d_fsdata;
+	struct dentry *new_dentry;
+	char *dup_name;
 	int error;
-	struct dentry * new_dentry;
-	struct sysfs_dirent *sd, *parent_sd;
 
 	if (!new_parent)
 		return -EFAULT;
@@ -457,22 +469,31 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent,
 	if (new_dentry->d_inode)
 		goto out_dput;
 
+	/* rename kobject and sysfs_dirent */
+	error = -ENOMEM;
+	new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
+	if (!new_name)
+		goto out_drop;
+
 	error = kobject_set_name(kobj, "%s", new_name);
 	if (error)
-		goto out_drop;
+		goto out_free;
 
+	kfree(sd->s_name);
+	sd->s_name = new_name;
+
+	/* move under the new parent */
 	d_add(new_dentry, NULL);
 	d_move(kobj->dentry, new_dentry);
 
-	sd = kobj->dentry->d_fsdata;
-	parent_sd = new_parent->d_fsdata;
-
 	list_del_init(&sd->s_sibling);
 	list_add(&sd->s_sibling, &parent_sd->s_children);
 
 	error = 0;
 	goto out_unlock;
 
+ out_free:
+	kfree(dup_name);
  out_drop:
 	d_drop(new_dentry);
  out_dput:
@@ -535,7 +556,7 @@ static int sysfs_dir_open(struct inode *inode, struct file *file)
 	struct sysfs_dirent * sd;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	sd = sysfs_new_dirent(NULL, 0, 0);
+	sd = sysfs_new_dirent("_DIR_", NULL, 0, 0);
 	if (sd)
 		sysfs_attach_dirent(sd, parent_sd, NULL);
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -605,7 +626,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 				if (!next->s_element)
 					continue;
 
-				name = sysfs_get_name(next);
+				name = next->s_name;
 				len = strlen(name);
 				ino = next->s_ino;
 
@@ -717,7 +738,7 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj)
 	if (!shadow)
 		goto nomem;
 
-	sd = sysfs_new_dirent(kobj, inode->i_mode, SYSFS_DIR);
+	sd = sysfs_new_dirent("_SHADOW_", kobj, inode->i_mode, SYSFS_DIR);
 	if (!sd)
 		goto nomem;
 	/* point to parent_sd but don't attach to it */
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index fd4b6dc03d2d..8240b1687dd0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -454,7 +454,7 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type)
 		goto out_unlock;
 	}
 
-	sd = sysfs_new_dirent((void *)attr, mode, type);
+	sd = sysfs_new_dirent(attr->name, (void *)attr, mode, type);
 	if (!sd) {
 		error = -ENOMEM;
 		goto out_unlock;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 5266eec15f6e..5c605b0003a8 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -191,37 +191,6 @@ int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
 	return error;
 }
 
-/*
- * Get the name for corresponding element represented by the given sysfs_dirent
- */
-const unsigned char * sysfs_get_name(struct sysfs_dirent *sd)
-{
-	struct attribute * attr;
-	struct bin_attribute * bin_attr;
-	struct sysfs_symlink  * sl;
-
-	BUG_ON(!sd || !sd->s_element);
-
-	switch (sd->s_type) {
-		case SYSFS_DIR:
-			/* Always have a dentry so use that */
-			return sd->s_dentry->d_name.name;
-
-		case SYSFS_KOBJ_ATTR:
-			attr = sd->s_element;
-			return attr->name;
-
-		case SYSFS_KOBJ_BIN_ATTR:
-			bin_attr = sd->s_element;
-			return bin_attr->attr.name;
-
-		case SYSFS_KOBJ_LINK:
-			sl = sd->s_element;
-			return sl->link_name;
-	}
-	return NULL;
-}
-
 static inline void orphan_all_buffers(struct inode *node)
 {
 	struct sysfs_buffer_collection *set;
@@ -305,7 +274,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name)
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (!sd->s_element)
 			continue;
-		if (!strcmp(sysfs_get_name(sd), name)) {
+		if (!strcmp(sd->s_name, name)) {
 			list_del_init(&sd->s_sibling);
 			sysfs_drop_dentry(sd, dir);
 			sysfs_put(sd);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index d96bb9cbc9d4..c72820450e7c 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -57,14 +57,9 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj
 	if (!sl)
 		goto err_out;
 
-	sl->link_name = kmalloc(strlen(name) + 1, GFP_KERNEL);
-	if (!sl->link_name)
-		goto err_out;
-
-	strcpy(sl->link_name, name);
 	sl->target_kobj = kobject_get(target);
 
-	sd = sysfs_new_dirent(sl, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
+	sd = sysfs_new_dirent(name, sl, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
 	if (!sd)
 		goto err_out;
 	sysfs_attach_dirent(sd, parent_sd, NULL);
@@ -74,7 +69,6 @@ static int sysfs_add_link(struct dentry * parent, const char * name, struct kobj
  err_out:
 	if (sl) {
 		kobject_put(sl->target_kobj);
-		kfree(sl->link_name);
 		kfree(sl);
 	}
 	return error;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ce05d6fd7522..d34b008537d5 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -3,6 +3,7 @@ struct sysfs_dirent {
 	struct sysfs_dirent	* s_parent;
 	struct list_head	s_sibling;
 	struct list_head	s_children;
+	const char		* s_name;
 	void 			* s_element;
 	int			s_type;
 	umode_t			s_mode;
@@ -21,8 +22,8 @@ extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
 
 extern void release_sysfs_dirent(struct sysfs_dirent * sd);
 extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *);
-extern struct sysfs_dirent *sysfs_new_dirent(void *element, umode_t mode,
-					     int type);
+extern struct sysfs_dirent *sysfs_new_dirent(const char *name, void *element,
+					     umode_t mode, int type);
 extern void sysfs_attach_dirent(struct sysfs_dirent *sd,
 				struct sysfs_dirent *parent_sd,
 				struct dentry *dentry);
@@ -34,7 +35,6 @@ extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * na
 extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **);
 extern void sysfs_remove_subdir(struct dentry *);
 
-extern const unsigned char * sysfs_get_name(struct sysfs_dirent *sd);
 extern void sysfs_drop_dentry(struct sysfs_dirent *sd, struct dentry *parent);
 extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 
@@ -48,7 +48,6 @@ extern const struct inode_operations sysfs_dir_inode_operations;
 extern const struct inode_operations sysfs_symlink_inode_operations;
 
 struct sysfs_symlink {
-	char * link_name;
 	struct kobject * target_kobj;
 };
 
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 7d5d1ec95c2e..2f86b080b39d 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -76,6 +76,7 @@ struct sysfs_ops {
 #define SYSFS_KOBJ_BIN_ATTR	0x0008
 #define SYSFS_KOBJ_LINK 	0x0020
 #define SYSFS_NOT_PINNED	(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR | SYSFS_KOBJ_LINK)
+#define SYSFS_COPY_NAME		(SYSFS_DIR | SYSFS_KOBJ_LINK)
 
 #ifdef CONFIG_SYSFS
 
-- 
cgit v1.3-8-gc7d7


From 7b595756ec1f49e0049a9e01a1298d53a7faaa15 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 03:45:17 +0900
Subject: sysfs: kill unnecessary attribute->owner

sysfs is now completely out of driver/module lifetime game.  After
deletion, a sysfs node doesn't access anything outside sysfs proper,
so there's no reason to hold onto the attribute owners.  Note that
often the wrong modules were accounted for as owners leading to
accessing removed modules.

This patch kills now unnecessary attribute->owner.  Note that with
this change, userland holding a sysfs node does not prevent the
backing module from being unloaded.

For more info regarding lifetime rule cleanup, please read the
following message.

  http://article.gmane.org/gmane.linux.kernel/510293

(tweaked by Greg to not delete the field just yet, to make it easier to
merge things properly.)

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/ppc/syslib/mv64x60.c           |  1 -
 arch/s390/kernel/ipl.c              |  2 --
 drivers/base/bus.c                  |  2 --
 drivers/base/class.c                |  2 --
 drivers/base/core.c                 |  4 ----
 drivers/base/firmware_class.c       |  2 +-
 drivers/block/pktcdvd.c             |  3 +--
 drivers/char/ipmi/ipmi_msghandler.c | 10 ----------
 drivers/cpufreq/cpufreq_stats.c     |  3 +--
 drivers/cpufreq/cpufreq_userspace.c |  2 +-
 drivers/cpufreq/freq_table.c        |  1 -
 drivers/firmware/dcdbas.h           |  3 +--
 drivers/firmware/dell_rbu.c         |  6 +++---
 drivers/firmware/edd.c              |  2 +-
 drivers/firmware/efivars.c          |  6 +++---
 drivers/i2c/chips/eeprom.c          |  1 -
 drivers/i2c/chips/max6875.c         |  1 -
 drivers/infiniband/core/sysfs.c     |  1 -
 drivers/input/mouse/psmouse.h       |  1 -
 drivers/macintosh/windfarm_core.c   |  2 --
 drivers/misc/asus-laptop.c          |  3 +--
 drivers/net/ibmveth.c               |  2 +-
 drivers/parisc/pdc_stable.c         |  4 ++--
 drivers/pci/hotplug/acpiphp_ibm.c   |  1 -
 drivers/pci/pci-sysfs.c             |  4 ----
 drivers/pci/probe.c                 |  2 --
 drivers/pcmcia/socket_sysfs.c       |  2 +-
 drivers/rapidio/rio-sysfs.c         |  1 -
 drivers/rtc/rtc-ds1553.c            |  1 -
 drivers/rtc/rtc-ds1742.c            |  1 -
 drivers/s390/cio/chp.c              |  2 --
 drivers/s390/net/qeth_sys.c         |  2 +-
 drivers/scsi/arcmsr/arcmsr_attr.c   |  3 ---
 drivers/scsi/libsas/sas_expander.c  |  1 -
 drivers/scsi/lpfc/lpfc_attr.c       |  2 --
 drivers/scsi/qla2xxx/qla_attr.c     |  6 ------
 drivers/spi/at25.c                  |  1 -
 drivers/video/aty/radeon_base.c     |  2 --
 drivers/video/backlight/backlight.c |  2 +-
 drivers/video/backlight/lcd.c       |  2 +-
 drivers/w1/slaves/w1_ds2433.c       |  1 -
 drivers/w1/slaves/w1_therm.c        |  1 -
 drivers/w1/w1.c                     |  2 --
 drivers/zorro/zorro-sysfs.c         |  1 -
 fs/ecryptfs/main.c                  |  2 --
 fs/ocfs2/cluster/masklog.c          |  1 -
 fs/partitions/check.c               |  1 -
 fs/sysfs/bin.c                      | 19 +++++--------------
 fs/sysfs/file.c                     | 21 +++++----------------
 include/linux/sysdev.h              |  3 +--
 include/linux/sysfs.h               | 12 ++++++++----
 kernel/module.c                     |  9 +++------
 kernel/params.c                     |  1 -
 net/bridge/br_sysfs_br.c            |  3 +--
 net/bridge/br_sysfs_if.c            |  3 +--
 55 files changed, 44 insertions(+), 135 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/syslib/mv64x60.c b/arch/ppc/syslib/mv64x60.c
index 8485a68cd475..032f4b7f4225 100644
--- a/arch/ppc/syslib/mv64x60.c
+++ b/arch/ppc/syslib/mv64x60.c
@@ -2415,7 +2415,6 @@ static struct bin_attribute mv64xxx_hs_reg_attr = { /* Hotswap register */
 	.attr = {
 		.name = "hs_reg",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size  = VAL_LEN_MAX,
 	.read  = mv64xxx_hs_reg_read,
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 82b131ddd7ff..9a13b24ee1ab 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -312,7 +312,6 @@ static struct bin_attribute ipl_parameter_attr = {
 	.attr = {
 		.name = "binary_parameter",
 		.mode = S_IRUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = PAGE_SIZE,
 	.read = &ipl_parameter_read,
@@ -336,7 +335,6 @@ static struct bin_attribute ipl_scp_data_attr = {
 	.attr = {
 		.name = "scp_data",
 		.mode = S_IRUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = PAGE_SIZE,
 	.read = &ipl_scp_data_read,
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index f299e0d6abc4..61c67526a656 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -574,7 +574,6 @@ static int add_probe_files(struct bus_type *bus)
 
 	bus->drivers_probe_attr.attr.name = "drivers_probe";
 	bus->drivers_probe_attr.attr.mode = S_IWUSR;
-	bus->drivers_probe_attr.attr.owner = bus->owner;
 	bus->drivers_probe_attr.store = store_drivers_probe;
 	retval = bus_create_file(bus, &bus->drivers_probe_attr);
 	if (retval)
@@ -582,7 +581,6 @@ static int add_probe_files(struct bus_type *bus)
 
 	bus->drivers_autoprobe_attr.attr.name = "drivers_autoprobe";
 	bus->drivers_autoprobe_attr.attr.mode = S_IWUSR | S_IRUGO;
-	bus->drivers_autoprobe_attr.attr.owner = bus->owner;
 	bus->drivers_autoprobe_attr.show = show_drivers_autoprobe;
 	bus->drivers_autoprobe_attr.store = store_drivers_autoprobe;
 	retval = bus_create_file(bus, &bus->drivers_autoprobe_attr);
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 8c506dbe3913..9cbfde23b9e3 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -605,7 +605,6 @@ int class_device_add(struct class_device *class_dev)
 		goto out3;
 	class_dev->uevent_attr.attr.name = "uevent";
 	class_dev->uevent_attr.attr.mode = S_IWUSR;
-	class_dev->uevent_attr.attr.owner = parent_class->owner;
 	class_dev->uevent_attr.store = store_uevent;
 	error = class_device_create_file(class_dev, &class_dev->uevent_attr);
 	if (error)
@@ -620,7 +619,6 @@ int class_device_add(struct class_device *class_dev)
 		}
 		attr->attr.name = "dev";
 		attr->attr.mode = S_IRUGO;
-		attr->attr.owner = parent_class->owner;
 		attr->show = show_dev;
 		error = class_device_create_file(class_dev, attr);
 		if (error) {
diff --git a/drivers/base/core.c b/drivers/base/core.c
index cff4fbfbb055..e3fb87bfc6e1 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -683,8 +683,6 @@ int device_add(struct device *dev)
 
 	dev->uevent_attr.attr.name = "uevent";
 	dev->uevent_attr.attr.mode = S_IRUGO | S_IWUSR;
-	if (dev->driver)
-		dev->uevent_attr.attr.owner = dev->driver->owner;
 	dev->uevent_attr.store = store_uevent;
 	dev->uevent_attr.show = show_uevent;
 	error = device_create_file(dev, &dev->uevent_attr);
@@ -700,8 +698,6 @@ int device_add(struct device *dev)
 		}
 		attr->attr.name = "dev";
 		attr->attr.mode = S_IRUGO;
-		if (dev->driver)
-			attr->attr.owner = dev->driver->owner;
 		attr->show = show_dev;
 		error = device_create_file(dev, attr);
 		if (error) {
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 89a5f4a54913..0e511485d2e6 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -271,7 +271,7 @@ out:
 }
 
 static struct bin_attribute firmware_attr_data_tmpl = {
-	.attr = {.name = "data", .mode = 0644, .owner = THIS_MODULE},
+	.attr = {.name = "data", .mode = 0644},
 	.size = 0,
 	.read = firmware_data_read,
 	.write = firmware_data_write,
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index f1b9dd7d47d6..ce64e86d6ffb 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -146,8 +146,7 @@ static void pkt_kobj_release(struct kobject *kobj)
  **********************************************************/
 
 #define DEF_ATTR(_obj,_name,_mode) \
-	static struct attribute _obj = { \
-		.name = _name, .owner = THIS_MODULE, .mode = _mode }
+	static struct attribute _obj = { .name = _name, .mode = _mode }
 
 /**********************************************************
   /sys/class/pktcdvd/pktcdvd[0-7]/
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 8e222f2b80cc..b5df7e61aeb2 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -2171,52 +2171,42 @@ static int create_files(struct bmc_device *bmc)
 	int err;
 
 	bmc->device_id_attr.attr.name = "device_id";
-	bmc->device_id_attr.attr.owner = THIS_MODULE;
 	bmc->device_id_attr.attr.mode = S_IRUGO;
 	bmc->device_id_attr.show = device_id_show;
 
 	bmc->provides_dev_sdrs_attr.attr.name = "provides_device_sdrs";
-	bmc->provides_dev_sdrs_attr.attr.owner = THIS_MODULE;
 	bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO;
 	bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show;
 
 	bmc->revision_attr.attr.name = "revision";
-	bmc->revision_attr.attr.owner = THIS_MODULE;
 	bmc->revision_attr.attr.mode = S_IRUGO;
 	bmc->revision_attr.show = revision_show;
 
 	bmc->firmware_rev_attr.attr.name = "firmware_revision";
-	bmc->firmware_rev_attr.attr.owner = THIS_MODULE;
 	bmc->firmware_rev_attr.attr.mode = S_IRUGO;
 	bmc->firmware_rev_attr.show = firmware_rev_show;
 
 	bmc->version_attr.attr.name = "ipmi_version";
-	bmc->version_attr.attr.owner = THIS_MODULE;
 	bmc->version_attr.attr.mode = S_IRUGO;
 	bmc->version_attr.show = ipmi_version_show;
 
 	bmc->add_dev_support_attr.attr.name = "additional_device_support";
-	bmc->add_dev_support_attr.attr.owner = THIS_MODULE;
 	bmc->add_dev_support_attr.attr.mode = S_IRUGO;
 	bmc->add_dev_support_attr.show = add_dev_support_show;
 
 	bmc->manufacturer_id_attr.attr.name = "manufacturer_id";
-	bmc->manufacturer_id_attr.attr.owner = THIS_MODULE;
 	bmc->manufacturer_id_attr.attr.mode = S_IRUGO;
 	bmc->manufacturer_id_attr.show = manufacturer_id_show;
 
 	bmc->product_id_attr.attr.name = "product_id";
-	bmc->product_id_attr.attr.owner = THIS_MODULE;
 	bmc->product_id_attr.attr.mode = S_IRUGO;
 	bmc->product_id_attr.show = product_id_show;
 
 	bmc->guid_attr.attr.name = "guid";
-	bmc->guid_attr.attr.owner = THIS_MODULE;
 	bmc->guid_attr.attr.mode = S_IRUGO;
 	bmc->guid_attr.show = guid_show;
 
 	bmc->aux_firmware_rev_attr.attr.name = "aux_firmware_revision";
-	bmc->aux_firmware_rev_attr.attr.owner = THIS_MODULE;
 	bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO;
 	bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show;
 
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index d2f0cbd8b8f3..917b9bab9ccb 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -25,8 +25,7 @@ static spinlock_t cpufreq_stats_lock;
 
 #define CPUFREQ_STATDEVICE_ATTR(_name,_mode,_show) \
 static struct freq_attr _attr_##_name = {\
-	.attr = {.name = __stringify(_name), .owner = THIS_MODULE, \
-		.mode = _mode, }, \
+	.attr = {.name = __stringify(_name), .mode = _mode, }, \
 	.show = _show,\
 };
 
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index 860345c7799a..a648970338b0 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -120,7 +120,7 @@ store_speed (struct cpufreq_policy *policy, const char *buf, size_t count)
 
 static struct freq_attr freq_attr_scaling_setspeed =
 {
-	.attr = { .name = "scaling_setspeed", .mode = 0644, .owner = THIS_MODULE },
+	.attr = { .name = "scaling_setspeed", .mode = 0644 },
 	.show = show_speed,
 	.store = store_speed,
 };
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index e7490925fdcf..5409f3afb3f8 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -199,7 +199,6 @@ static ssize_t show_available_freqs (struct cpufreq_policy *policy, char *buf)
 struct freq_attr cpufreq_freq_attr_scaling_available_freqs = {
 	.attr = { .name = "scaling_available_frequencies",
 		  .mode = 0444,
-		  .owner=THIS_MODULE
 		},
 	.show = show_available_freqs,
 };
diff --git a/drivers/firmware/dcdbas.h b/drivers/firmware/dcdbas.h
index 58a85182b3e8..dcdba0f1b32c 100644
--- a/drivers/firmware/dcdbas.h
+++ b/drivers/firmware/dcdbas.h
@@ -67,8 +67,7 @@
 #define DCDBAS_BIN_ATTR_RW(_name) \
 struct bin_attribute bin_attr_##_name = { \
 	.attr =  { .name = __stringify(_name), \
-		   .mode = 0600, \
-		   .owner = THIS_MODULE }, \
+		   .mode = 0600 }, \
 	.read =  _name##_read, \
 	.write = _name##_write, \
 }
diff --git a/drivers/firmware/dell_rbu.c b/drivers/firmware/dell_rbu.c
index fc702e40bd43..f8afecb7d0cf 100644
--- a/drivers/firmware/dell_rbu.c
+++ b/drivers/firmware/dell_rbu.c
@@ -687,18 +687,18 @@ static ssize_t write_rbu_packet_size(struct kobject *kobj, char *buffer,
 }
 
 static struct bin_attribute rbu_data_attr = {
-	.attr = {.name = "data",.owner = THIS_MODULE,.mode = 0444},
+	.attr = {.name = "data", .mode = 0444},
 	.read = read_rbu_data,
 };
 
 static struct bin_attribute rbu_image_type_attr = {
-	.attr = {.name = "image_type",.owner = THIS_MODULE,.mode = 0644},
+	.attr = {.name = "image_type", .mode = 0644},
 	.read = read_rbu_image_type,
 	.write = write_rbu_image_type,
 };
 
 static struct bin_attribute rbu_packet_size_attr = {
-	.attr = {.name = "packet_size",.owner = THIS_MODULE,.mode = 0644},
+	.attr = {.name = "packet_size", .mode = 0644},
 	.read = read_rbu_packet_size,
 	.write = write_rbu_packet_size,
 };
diff --git a/drivers/firmware/edd.c b/drivers/firmware/edd.c
index d8806e4f1829..15232271d848 100644
--- a/drivers/firmware/edd.c
+++ b/drivers/firmware/edd.c
@@ -74,7 +74,7 @@ static struct edd_device *edd_devices[EDD_MBR_SIG_MAX];
 
 #define EDD_DEVICE_ATTR(_name,_mode,_show,_test) \
 struct edd_attribute edd_attr_##_name = { 	\
-	.attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE },	\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,				\
 	.test	= _test,				\
 };
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 1324984a4c35..bfd2d67df689 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -131,21 +131,21 @@ struct efivar_attribute {
 
 #define EFI_ATTR(_name, _mode, _show, _store) \
 struct subsys_attribute efi_attr_##_name = { \
-	.attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE}, \
+	.attr = {.name = __stringify(_name), .mode = _mode}, \
 	.show = _show, \
 	.store = _store, \
 };
 
 #define EFIVAR_ATTR(_name, _mode, _show, _store) \
 struct efivar_attribute efivar_attr_##_name = { \
-	.attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE}, \
+	.attr = {.name = __stringify(_name), .mode = _mode}, \
 	.show = _show, \
 	.store = _store, \
 };
 
 #define VAR_SUBSYS_ATTR(_name, _mode, _show, _store) \
 struct subsys_attribute var_subsys_attr_##_name = { \
-	.attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE}, \
+	.attr = {.name = __stringify(_name), .mode = _mode}, \
 	.show = _show, \
 	.store = _store, \
 };
diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c
index bfce13c8f1ff..5990dd5fc773 100644
--- a/drivers/i2c/chips/eeprom.c
+++ b/drivers/i2c/chips/eeprom.c
@@ -143,7 +143,6 @@ static struct bin_attribute eeprom_attr = {
 	.attr = {
 		.name = "eeprom",
 		.mode = S_IRUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = EEPROM_SIZE,
 	.read = eeprom_read,
diff --git a/drivers/i2c/chips/max6875.c b/drivers/i2c/chips/max6875.c
index 76645c142977..1405ce5b236c 100644
--- a/drivers/i2c/chips/max6875.c
+++ b/drivers/i2c/chips/max6875.c
@@ -152,7 +152,6 @@ static struct bin_attribute user_eeprom_attr = {
 	.attr = {
 		.name = "eeprom",
 		.mode = S_IRUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = USER_EEPROM_SIZE,
 	.read = max6875_read,
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 08c299ebf4a8..bf9b99292048 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -479,7 +479,6 @@ alloc_group_attrs(ssize_t (*show)(struct ib_port *,
 
 		element->attr.attr.name  = element->name;
 		element->attr.attr.mode  = S_IRUGO;
-		element->attr.attr.owner = THIS_MODULE;
 		element->attr.show       = show;
 		element->index		 = i;
 
diff --git a/drivers/input/mouse/psmouse.h b/drivers/input/mouse/psmouse.h
index 27a68835b5ba..1317bdd8cc7c 100644
--- a/drivers/input/mouse/psmouse.h
+++ b/drivers/input/mouse/psmouse.h
@@ -119,7 +119,6 @@ static struct psmouse_attribute psmouse_attr_##_name = {			\
 		.attr	= {							\
 			.name	= __stringify(_name),				\
 			.mode	= _mode,					\
-			.owner	= THIS_MODULE,					\
 		},								\
 		.show	= psmouse_attr_show_helper,				\
 		.store	= psmouse_attr_set_helper,				\
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index 11ced17f438a..4fcb245ba184 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -212,7 +212,6 @@ int wf_register_control(struct wf_control *new_ct)
 	list_add(&new_ct->link, &wf_controls);
 
 	new_ct->attr.attr.name = new_ct->name;
-	new_ct->attr.attr.owner = THIS_MODULE;
 	new_ct->attr.attr.mode = 0644;
 	new_ct->attr.show = wf_show_control;
 	new_ct->attr.store = wf_store_control;
@@ -325,7 +324,6 @@ int wf_register_sensor(struct wf_sensor *new_sr)
 	list_add(&new_sr->link, &wf_sensors);
 
 	new_sr->attr.attr.name = new_sr->name;
-	new_sr->attr.attr.owner = THIS_MODULE;
 	new_sr->attr.attr.mode = 0444;
 	new_sr->attr.show = wf_show_sensor;
 	new_sr->attr.store = NULL;
diff --git a/drivers/misc/asus-laptop.c b/drivers/misc/asus-laptop.c
index 4f9060a2a2f2..7798f590e5aa 100644
--- a/drivers/misc/asus-laptop.c
+++ b/drivers/misc/asus-laptop.c
@@ -737,8 +737,7 @@ static void asus_hotk_notify(acpi_handle handle, u32 event, void *data)
 	struct device_attribute dev_attr_##_name = {			\
 		.attr = {						\
 			.name = __stringify(_name),			\
-			.mode = 0,					\
-			.owner = THIS_MODULE },				\
+			.mode = 0 },					\
 		.show   = NULL,						\
 		.store  = NULL,						\
 	}
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 6ec3d500f334..d96eb7229548 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -1337,7 +1337,7 @@ const char * buf, size_t count)
 
 #define ATTR(_name, _mode)      \
         struct attribute veth_##_name##_attr = {               \
-        .name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE \
+        .name = __stringify(_name), .mode = _mode, \
         };
 
 static ATTR(active, 0644);
diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c
index 924ef0609460..fc4bde259dc7 100644
--- a/drivers/parisc/pdc_stable.c
+++ b/drivers/parisc/pdc_stable.c
@@ -121,14 +121,14 @@ struct pdcspath_entry pdcspath_entry_##_name = { \
 
 #define PDCS_ATTR(_name, _mode, _show, _store) \
 struct subsys_attribute pdcs_attr_##_name = { \
-	.attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE}, \
+	.attr = {.name = __stringify(_name), .mode = _mode}, \
 	.show = _show, \
 	.store = _store, \
 };
 
 #define PATHS_ATTR(_name, _mode, _show, _store) \
 struct pdcspath_attribute paths_attr_##_name = { \
-	.attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE}, \
+	.attr = {.name = __stringify(_name), .mode = _mode}, \
 	.show = _show, \
 	.store = _store, \
 };
diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c
index e7322c25d377..74556ec31a5b 100644
--- a/drivers/pci/hotplug/acpiphp_ibm.c
+++ b/drivers/pci/hotplug/acpiphp_ibm.c
@@ -117,7 +117,6 @@ static struct notification ibm_note;
 static struct bin_attribute ibm_apci_table_attr = {
 	    .attr = {
 		    .name = "apci_table",
-		    .owner = THIS_MODULE,
 		    .mode = S_IRUGO,
 	    },
 	    .read = ibm_read_apci_table,
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 284e83a527f9..d448f8df8613 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -499,7 +499,6 @@ static int pci_create_resource_files(struct pci_dev *pdev)
 			sprintf(res_attr_name, "resource%d", i);
 			res_attr->attr.name = res_attr_name;
 			res_attr->attr.mode = S_IRUSR | S_IWUSR;
-			res_attr->attr.owner = THIS_MODULE;
 			res_attr->size = pci_resource_len(pdev, i);
 			res_attr->mmap = pci_mmap_resource;
 			res_attr->private = &pdev->resource[i];
@@ -582,7 +581,6 @@ static struct bin_attribute pci_config_attr = {
 	.attr =	{
 		.name = "config",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 256,
 	.read = pci_read_config,
@@ -593,7 +591,6 @@ static struct bin_attribute pcie_config_attr = {
 	.attr =	{
 		.name = "config",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 4096,
 	.read = pci_read_config,
@@ -628,7 +625,6 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
 			rom_attr->size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
 			rom_attr->attr.name = "rom";
 			rom_attr->attr.mode = S_IRUSR;
-			rom_attr->attr.owner = THIS_MODULE;
 			rom_attr->read = pci_read_rom;
 			rom_attr->write = pci_write_rom;
 			retval = sysfs_create_bin_file(&pdev->dev.kobj, rom_attr);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index e48fcf089621..08783bd381f5 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -39,7 +39,6 @@ static void pci_create_legacy_files(struct pci_bus *b)
 		b->legacy_io->attr.name = "legacy_io";
 		b->legacy_io->size = 0xffff;
 		b->legacy_io->attr.mode = S_IRUSR | S_IWUSR;
-		b->legacy_io->attr.owner = THIS_MODULE;
 		b->legacy_io->read = pci_read_legacy_io;
 		b->legacy_io->write = pci_write_legacy_io;
 		class_device_create_bin_file(&b->class_dev, b->legacy_io);
@@ -49,7 +48,6 @@ static void pci_create_legacy_files(struct pci_bus *b)
 		b->legacy_mem->attr.name = "legacy_mem";
 		b->legacy_mem->size = 1024*1024;
 		b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR;
-		b->legacy_mem->attr.owner = THIS_MODULE;
 		b->legacy_mem->mmap = pci_mmap_legacy_mem;
 		class_device_create_bin_file(&b->class_dev, b->legacy_mem);
 	}
diff --git a/drivers/pcmcia/socket_sysfs.c b/drivers/pcmcia/socket_sysfs.c
index a2bb46526b56..dbfbe65779e5 100644
--- a/drivers/pcmcia/socket_sysfs.c
+++ b/drivers/pcmcia/socket_sysfs.c
@@ -366,7 +366,7 @@ static struct device_attribute *pccard_socket_attributes[] = {
 };
 
 static struct bin_attribute pccard_cis_attr = {
-	.attr = { .name = "cis", .mode = S_IRUGO | S_IWUSR, .owner = THIS_MODULE},
+	.attr = { .name = "cis", .mode = S_IRUGO | S_IWUSR },
 	.size = 0x200,
 	.read = pccard_show_cis,
 	.write = pccard_store_cis,
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index eed91434417d..a3972b9f96e6 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -197,7 +197,6 @@ static struct bin_attribute rio_config_attr = {
 	.attr = {
 		 .name = "config",
 		 .mode = S_IRUGO | S_IWUSR,
-		 .owner = THIS_MODULE,
 		 },
 	.size = 0x200000,
 	.read = rio_read_config,
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index afa64c7fa2e2..b024cfb558f4 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -290,7 +290,6 @@ static struct bin_attribute ds1553_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = RTC_OFFSET,
 	.read = ds1553_nvram_read,
diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c
index d68288b389dc..1638acdbc913 100644
--- a/drivers/rtc/rtc-ds1742.c
+++ b/drivers/rtc/rtc-ds1742.c
@@ -159,7 +159,6 @@ static struct bin_attribute ds1742_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUGO,
-		.owner = THIS_MODULE,
 	},
 	.read = ds1742_nvram_read,
 	.write = ds1742_nvram_write,
diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index ac289e6eadfe..96a8a72a6083 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -165,7 +165,6 @@ static struct bin_attribute chp_measurement_chars_attr = {
 	.attr = {
 		.name = "measurement_chars",
 		.mode = S_IRUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = sizeof(struct cmg_chars),
 	.read = chp_measurement_chars_read,
@@ -217,7 +216,6 @@ static struct bin_attribute chp_measurement_attr = {
 	.attr = {
 		.name = "measurement",
 		.mode = S_IRUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = sizeof(struct cmg_entry),
 	.read = chp_measurement_read,
diff --git a/drivers/s390/net/qeth_sys.c b/drivers/s390/net/qeth_sys.c
index 65ffc21afc37..bb0287ad1aac 100644
--- a/drivers/s390/net/qeth_sys.c
+++ b/drivers/s390/net/qeth_sys.c
@@ -991,7 +991,7 @@ static struct attribute_group qeth_osn_device_attr_group = {
 
 #define QETH_DEVICE_ATTR(_id,_name,_mode,_show,_store)			     \
 struct device_attribute dev_attr_##_id = {				     \
-	.attr = {.name=__stringify(_name), .mode=_mode, .owner=THIS_MODULE },\
+	.attr = {.name=__stringify(_name), .mode=_mode, },\
 	.show	= _show,						     \
 	.store	= _store,						     \
 };
diff --git a/drivers/scsi/arcmsr/arcmsr_attr.c b/drivers/scsi/arcmsr/arcmsr_attr.c
index 03bfed61bffc..8908228bc134 100644
--- a/drivers/scsi/arcmsr/arcmsr_attr.c
+++ b/drivers/scsi/arcmsr/arcmsr_attr.c
@@ -188,7 +188,6 @@ static struct bin_attribute arcmsr_sysfs_message_read_attr = {
 	.attr = {
 		.name = "mu_read",
 		.mode = S_IRUSR ,
-		.owner = THIS_MODULE,
 	},
 	.size = 1032,
 	.read = arcmsr_sysfs_iop_message_read,
@@ -198,7 +197,6 @@ static struct bin_attribute arcmsr_sysfs_message_write_attr = {
 	.attr = {
 		.name = "mu_write",
 		.mode = S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 1032,
 	.write = arcmsr_sysfs_iop_message_write,
@@ -208,7 +206,6 @@ static struct bin_attribute arcmsr_sysfs_message_clear_attr = {
 	.attr = {
 		.name = "mu_clear",
 		.mode = S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 1,
 	.write = arcmsr_sysfs_iop_message_clear,
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index e34442e405e8..578ed79f4148 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1368,7 +1368,6 @@ static void sas_ex_smp_hook(struct domain_device *dev)
 	memset(bin_attr, 0, sizeof(*bin_attr));
 
 	bin_attr->attr.name = SMP_BIN_ATTR_NAME;
-	bin_attr->attr.owner = THIS_MODULE;
 	bin_attr->attr.mode = 0600;
 
 	bin_attr->size = 0;
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index 95fe77e816f8..f81fe501a4a1 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -1200,7 +1200,6 @@ static struct bin_attribute sysfs_ctlreg_attr = {
 	.attr = {
 		.name = "ctlreg",
 		.mode = S_IRUSR | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 256,
 	.read = sysfs_ctlreg_read,
@@ -1422,7 +1421,6 @@ static struct bin_attribute sysfs_mbox_attr = {
 	.attr = {
 		.name = "mbox",
 		.mode = S_IRUSR | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = MAILBOX_CMD_SIZE,
 	.read = sysfs_mbox_read,
diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c
index 8081b637d97e..96587253bfa9 100644
--- a/drivers/scsi/qla2xxx/qla_attr.c
+++ b/drivers/scsi/qla2xxx/qla_attr.c
@@ -73,7 +73,6 @@ static struct bin_attribute sysfs_fw_dump_attr = {
 	.attr = {
 		.name = "fw_dump",
 		.mode = S_IRUSR | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 0,
 	.read = qla2x00_sysfs_read_fw_dump,
@@ -149,7 +148,6 @@ static struct bin_attribute sysfs_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUSR | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 512,
 	.read = qla2x00_sysfs_read_nvram,
@@ -198,7 +196,6 @@ static struct bin_attribute sysfs_optrom_attr = {
 	.attr = {
 		.name = "optrom",
 		.mode = S_IRUSR | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = OPTROM_SIZE_24XX,
 	.read = qla2x00_sysfs_read_optrom,
@@ -279,7 +276,6 @@ static struct bin_attribute sysfs_optrom_ctl_attr = {
 	.attr = {
 		.name = "optrom_ctl",
 		.mode = S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 0,
 	.write = qla2x00_sysfs_write_optrom_ctl,
@@ -327,7 +323,6 @@ static struct bin_attribute sysfs_vpd_attr = {
 	.attr = {
 		.name = "vpd",
 		.mode = S_IRUSR | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 0,
 	.read = qla2x00_sysfs_read_vpd,
@@ -375,7 +370,6 @@ static struct bin_attribute sysfs_sfp_attr = {
 	.attr = {
 		.name = "sfp",
 		.mode = S_IRUSR | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = SFP_DEV_SIZE * 2,
 	.read = qla2x00_sysfs_read_sfp,
diff --git a/drivers/spi/at25.c b/drivers/spi/at25.c
index 8efa07e8b8c2..fde1dededba3 100644
--- a/drivers/spi/at25.c
+++ b/drivers/spi/at25.c
@@ -314,7 +314,6 @@ static int at25_probe(struct spi_device *spi)
 	 */
 	at25->bin.attr.name = "eeprom";
 	at25->bin.attr.mode = S_IRUSR;
-	at25->bin.attr.owner = THIS_MODULE;
 	at25->bin.read = at25_bin_read;
 
 	at25->bin.size = at25->chip.byte_len;
diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c
index 2ce050193018..3b3c6571f583 100644
--- a/drivers/video/aty/radeon_base.c
+++ b/drivers/video/aty/radeon_base.c
@@ -2126,7 +2126,6 @@ static ssize_t radeon_show_edid2(struct kobject *kobj, char *buf, loff_t off, si
 static struct bin_attribute edid1_attr = {
 	.attr   = {
 		.name	= "edid1",
-		.owner	= THIS_MODULE,
 		.mode	= 0444,
 	},
 	.size	= EDID_LENGTH,
@@ -2136,7 +2135,6 @@ static struct bin_attribute edid1_attr = {
 static struct bin_attribute edid2_attr = {
 	.attr   = {
 		.name	= "edid2",
-		.owner	= THIS_MODULE,
 		.mode	= 0444,
 	},
 	.size	= EDID_LENGTH,
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index c65e81ff3578..7e06223bca94 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -172,7 +172,7 @@ static struct class backlight_class = {
 
 #define DECLARE_ATTR(_name,_mode,_show,_store)			\
 {							 	\
-	.attr	= { .name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE },	\
+	.attr	= { .name = __stringify(_name), .mode = _mode }, \
 	.show	= _show,					\
 	.store	= _store,					\
 }
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 6ef8f0a7a137..648b53c1fdea 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -157,7 +157,7 @@ static struct class lcd_class = {
 
 #define DECLARE_ATTR(_name,_mode,_show,_store)			\
 {							 	\
-	.attr	= { .name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE },	\
+	.attr	= { .name = __stringify(_name), .mode = _mode }, \
 	.show	= _show,					\
 	.store	= _store,					\
 }
diff --git a/drivers/w1/slaves/w1_ds2433.c b/drivers/w1/slaves/w1_ds2433.c
index 8ea17a53eed8..4e13aa71adea 100644
--- a/drivers/w1/slaves/w1_ds2433.c
+++ b/drivers/w1/slaves/w1_ds2433.c
@@ -252,7 +252,6 @@ static struct bin_attribute w1_f23_bin_attr = {
 	.attr = {
 		.name = "eeprom",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = W1_EEPROM_SIZE,
 	.read = w1_f23_read_bin,
diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c
index 1a6937dc190b..8ba4e572e09c 100644
--- a/drivers/w1/slaves/w1_therm.c
+++ b/drivers/w1/slaves/w1_therm.c
@@ -48,7 +48,6 @@ static struct bin_attribute w1_therm_bin_attr = {
 	.attr = {
 		.name = "w1_slave",
 		.mode = S_IRUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = W1_SLAVE_DATA_SIZE,
 	.read = w1_therm_read_bin,
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 7d6876dbcc96..1838cb29b646 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -128,7 +128,6 @@ static struct bin_attribute w1_slave_attr_bin_id = {
       .attr = {
               .name = "id",
               .mode = S_IRUGO,
-              .owner = THIS_MODULE,
       },
       .size = 8,
       .read = w1_slave_read_id,
@@ -167,7 +166,6 @@ static struct bin_attribute w1_default_attr = {
       .attr = {
               .name = "rw",
               .mode = S_IRUGO | S_IWUSR,
-              .owner = THIS_MODULE,
       },
       .size = PAGE_SIZE,
       .read = w1_default_read,
diff --git a/drivers/zorro/zorro-sysfs.c b/drivers/zorro/zorro-sysfs.c
index c3ba0ec334c4..7e03cc68b182 100644
--- a/drivers/zorro/zorro-sysfs.c
+++ b/drivers/zorro/zorro-sysfs.c
@@ -78,7 +78,6 @@ static struct bin_attribute zorro_config_attr = {
 	.attr =	{
 		.name = "config",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE
 	},
 	.size = sizeof(struct ConfigDev),
 	.read = zorro_read_config,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 606128f5c927..02ca6f1e55d7 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -840,8 +840,6 @@ static int __init ecryptfs_init(void)
 		goto out;
 	}
 	kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
-	sysfs_attr_version.attr.owner = THIS_MODULE;
-	sysfs_attr_version_str.attr.owner = THIS_MODULE;
 	rc = do_sysfs_registration();
 	if (rc) {
 		printk(KERN_ERR "sysfs registration failed\n");
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 2b205f5d5790..e9e042b93dbf 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -74,7 +74,6 @@ struct mlog_attribute {
 #define define_mask(_name) {			\
 	.attr = {				\
 		.name = #_name,			\
-		.owner = THIS_MODULE,		\
 		.mode = S_IRUGO | S_IWUSR,	\
 	},					\
 	.mask = ML_##_name,			\
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9a3a058f3553..98e0b85a9bb2 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -397,7 +397,6 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 		static struct attribute addpartattr = {
 			.name = "whole_disk",
 			.mode = S_IRUSR | S_IRGRP | S_IROTH,
-			.owner = THIS_MODULE,
 		};
 
 		sysfs_create_file(&p->kobj, &addpartattr);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 618b8aea6a7b..3c5574a40b09 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -175,25 +175,20 @@ static int open(struct inode * inode, struct file * file)
 	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
-	/* Grab the module reference for this attribute */
-	error = -ENODEV;
-	if (!try_module_get(attr->attr.owner))
-		goto err_sput;
-
 	error = -EACCES;
 	if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap))
-		goto err_mput;
+		goto err_out;
 	if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap))
-		goto err_mput;
+		goto err_out;
 
 	error = -ENOMEM;
 	bb = kzalloc(sizeof(*bb), GFP_KERNEL);
 	if (!bb)
-		goto err_mput;
+		goto err_out;
 
 	bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!bb->buffer)
-		goto err_mput;
+		goto err_out;
 
 	mutex_init(&bb->mutex);
 	file->private_data = bb;
@@ -203,9 +198,7 @@ static int open(struct inode * inode, struct file * file)
 	sysfs_get(attr_sd);
 	return 0;
 
- err_mput:
-	module_put(attr->attr.owner);
- err_sput:
+ err_out:
 	sysfs_put_active(attr_sd);
 	kfree(bb);
 	return error;
@@ -214,13 +207,11 @@ static int open(struct inode * inode, struct file * file)
 static int release(struct inode * inode, struct file * file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
 	struct bin_buffer *bb = file->private_data;
 
 	if (bb->mmapped)
 		sysfs_put_active_two(attr_sd);
 	sysfs_put(attr_sd);
-	module_put(attr->attr.owner);
 	kfree(bb->buffer);
 	kfree(bb);
 	return 0;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d673d9b5d33f..a84b734f7b29 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -241,7 +241,6 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
 static int sysfs_open_file(struct inode *inode, struct file *file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct attribute *attr = attr_sd->s_elem.attr.attr;
 	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
 	struct sysfs_buffer * buffer;
 	struct sysfs_ops * ops = NULL;
@@ -251,11 +250,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	if (!sysfs_get_active_two(attr_sd))
 		return -ENODEV;
 
-	/* Grab the module reference for this attribute */
-	error = -ENODEV;
-	if (!try_module_get(attr->owner))
-		goto err_sput;
-
 	/* if the kobject has no ktype, then we assume that it is a subsystem
 	 * itself, and use ops for it.
 	 */
@@ -272,7 +266,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	 * or the subsystem have no operations.
 	 */
 	if (!ops)
-		goto err_mput;
+		goto err_out;
 
 	/* File needs write support.
 	 * The inode's perms must say it's ok, 
@@ -280,7 +274,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	 */
 	if (file->f_mode & FMODE_WRITE) {
 		if (!(inode->i_mode & S_IWUGO) || !ops->store)
-			goto err_mput;
+			goto err_out;
 	}
 
 	/* File needs read support.
@@ -289,7 +283,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	 */
 	if (file->f_mode & FMODE_READ) {
 		if (!(inode->i_mode & S_IRUGO) || !ops->show)
-			goto err_mput;
+			goto err_out;
 	}
 
 	/* No error? Great, allocate a buffer for the file, and store it
@@ -298,7 +292,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	error = -ENOMEM;
 	buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL);
 	if (!buffer)
-		goto err_mput;
+		goto err_out;
 
 	init_MUTEX(&buffer->sem);
 	buffer->needs_read_fill = 1;
@@ -310,9 +304,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	sysfs_get(attr_sd);
 	return 0;
 
- err_mput:
-	module_put(attr->owner);
- err_sput:
+ err_out:
 	sysfs_put_active_two(attr_sd);
 	return error;
 }
@@ -320,12 +312,9 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 static int sysfs_release(struct inode * inode, struct file * filp)
 {
 	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
-	struct attribute *attr = attr_sd->s_elem.attr.attr;
 	struct sysfs_buffer *buffer = filp->private_data;
 
 	sysfs_put(attr_sd);
-	/* After this point, attr should not be accessed. */
-	module_put(attr->owner);
 
 	if (buffer) {
 		if (buffer->page)
diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h
index e699ab279c2c..e285746588d6 100644
--- a/include/linux/sysdev.h
+++ b/include/linux/sysdev.h
@@ -101,8 +101,7 @@ struct sysdev_attribute {
 
 #define _SYSDEV_ATTR(_name,_mode,_show,_store)			\
 {								\
-	.attr = { .name = __stringify(_name), .mode = _mode,	\
-		 .owner = THIS_MODULE },			\
+	.attr = { .name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
 }
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 2f86b080b39d..161e19aa2b4f 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -20,9 +20,13 @@ struct module;
 struct nameidata;
 struct dentry;
 
+/* FIXME
+ * The *owner field is no longer used, but leave around
+ * until the tree gets cleaned up fully.
+ */
 struct attribute {
 	const char		* name;
-	struct module 		* owner;
+	struct module		* owner;
 	mode_t			mode;
 };
 
@@ -39,14 +43,14 @@ struct attribute_group {
  */
 
 #define __ATTR(_name,_mode,_show,_store) { \
-	.attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE },	\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
 }
 
 #define __ATTR_RO(_name) { \
-	.attr	= { .name = __stringify(_name), .mode = 0444, .owner = THIS_MODULE },	\
-	.show	= _name##_show,	\
+	.attr	= { .name = __stringify(_name), .mode = 0444 },	\
+	.show	= _name##_show,					\
 }
 
 #define __ATTR_NULL { .attr = { .name = NULL } }
diff --git a/kernel/module.c b/kernel/module.c
index 9bd93de01f4a..015d60cfd90e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -488,8 +488,7 @@ static void free_modinfo_##field(struct module *mod)                  \
         mod->field = NULL;                                            \
 }                                                                     \
 static struct module_attribute modinfo_##field = {                    \
-	.attr = { .name = __stringify(field), .mode = 0444,           \
-		  .owner = THIS_MODULE },                             \
+	.attr = { .name = __stringify(field), .mode = 0444 },         \
 	.show = show_modinfo_##field,                                 \
 	.setup = setup_modinfo_##field,                               \
 	.test = modinfo_##field##_exists,                             \
@@ -793,7 +792,7 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
 }
 
 static struct module_attribute refcnt = {
-	.attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE },
+	.attr = { .name = "refcnt", .mode = 0444 },
 	.show = show_refcnt,
 };
 
@@ -851,7 +850,7 @@ static ssize_t show_initstate(struct module_attribute *mattr,
 }
 
 static struct module_attribute initstate = {
-	.attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE },
+	.attr = { .name = "initstate", .mode = 0444 },
 	.show = show_initstate,
 };
 
@@ -1032,7 +1031,6 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 		sattr->mattr.show = module_sect_show;
 		sattr->mattr.store = NULL;
 		sattr->mattr.attr.name = sattr->name;
-		sattr->mattr.attr.owner = mod;
 		sattr->mattr.attr.mode = S_IRUGO;
 		*(gattr++) = &(sattr++)->mattr.attr;
 	}
@@ -1090,7 +1088,6 @@ int module_add_modinfo_attrs(struct module *mod)
 		if (!attr->test ||
 		    (attr->test && attr->test(mod))) {
 			memcpy(temp_attr, attr, sizeof(*temp_attr));
-			temp_attr->attr.owner = mod;
 			error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
 			++temp_attr;
 		}
diff --git a/kernel/params.c b/kernel/params.c
index e61c46c97ce7..effbaaedd7f3 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -491,7 +491,6 @@ param_sysfs_setup(struct module_kobject *mk,
 			pattr->mattr.show = param_attr_show;
 			pattr->mattr.store = param_attr_store;
 			pattr->mattr.attr.name = (char *)&kp->name[name_skip];
-			pattr->mattr.attr.owner = mk->mod;
 			pattr->mattr.attr.mode = kp->perm;
 			*(gattr++) = &(pattr++)->mattr.attr;
 		}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 33c6c4a7c689..31ace23a0914 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -383,8 +383,7 @@ static ssize_t brforward_read(struct kobject *kobj, char *buf,
 
 static struct bin_attribute bridge_forward = {
 	.attr = { .name = SYSFS_BRIDGE_FDB,
-		  .mode = S_IRUGO,
-		  .owner = THIS_MODULE, },
+		  .mode = S_IRUGO, },
 	.read = brforward_read,
 };
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 2da22927d8dd..79db51fcb476 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -29,8 +29,7 @@ struct brport_attribute {
 #define BRPORT_ATTR(_name,_mode,_show,_store)		        \
 struct brport_attribute brport_attr_##_name = { 	        \
 	.attr = {.name = __stringify(_name), 			\
-		 .mode = _mode, 				\
-		 .owner = THIS_MODULE, },			\
+		 .mode = _mode },				\
 	.show	= _show,					\
 	.store	= _store,					\
 };
-- 
cgit v1.3-8-gc7d7


From ad6a1e1c66009ba9dcd2f5c90ffa1fb4ce72fce0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 03:45:17 +0900
Subject: driver-core: make devt_attr and uevent_attr static

devt_attr and uevent_attr are either allocated dynamically with or
embedded in device and class_device as they needed their owner field
set to the module implementing the driver.  Now that sysfs implements
immediate disconnect and owner field removed from struct attribute,
there is no reason to do this.  Remove these attributes from
[class_]device and use static attribute structures instead.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/class.c   | 44 ++++++++++++++++----------------------------
 drivers/base/core.c    | 45 +++++++++++++++------------------------------
 include/linux/device.h |  5 -----
 3 files changed, 31 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/class.c b/drivers/base/class.c
index 9cbfde23b9e3..4d2222618b78 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -312,9 +312,6 @@ static void class_dev_release(struct kobject * kobj)
 
 	pr_debug("device class '%s': release.\n", cd->class_id);
 
-	kfree(cd->devt_attr);
-	cd->devt_attr = NULL;
-
 	if (cd->release)
 		cd->release(cd);
 	else if (cls->release)
@@ -547,6 +544,9 @@ static ssize_t show_dev(struct class_device *class_dev, char *buf)
 	return print_dev_t(buf, class_dev->devt);
 }
 
+static struct class_device_attribute class_devt_attr =
+	__ATTR(dev, S_IRUGO, show_dev, NULL);
+
 static ssize_t store_uevent(struct class_device *class_dev,
 			    const char *buf, size_t count)
 {
@@ -554,6 +554,9 @@ static ssize_t store_uevent(struct class_device *class_dev,
 	return count;
 }
 
+static struct class_device_attribute class_uevent_attr =
+	__ATTR(uevent, S_IWUSR, NULL, store_uevent);
+
 void class_device_initialize(struct class_device *class_dev)
 {
 	kobj_set_kset_s(class_dev, class_obj_subsys);
@@ -603,30 +606,15 @@ int class_device_add(struct class_device *class_dev)
 				  &parent_class->subsys.kobj, "subsystem");
 	if (error)
 		goto out3;
-	class_dev->uevent_attr.attr.name = "uevent";
-	class_dev->uevent_attr.attr.mode = S_IWUSR;
-	class_dev->uevent_attr.store = store_uevent;
-	error = class_device_create_file(class_dev, &class_dev->uevent_attr);
+
+	error = class_device_create_file(class_dev, &class_uevent_attr);
 	if (error)
 		goto out3;
 
 	if (MAJOR(class_dev->devt)) {
-		struct class_device_attribute *attr;
-		attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-		if (!attr) {
-			error = -ENOMEM;
-			goto out4;
-		}
-		attr->attr.name = "dev";
-		attr->attr.mode = S_IRUGO;
-		attr->show = show_dev;
-		error = class_device_create_file(class_dev, attr);
-		if (error) {
-			kfree(attr);
+		error = class_device_create_file(class_dev, &class_devt_attr);
+		if (error)
 			goto out4;
-		}
-
-		class_dev->devt_attr = attr;
 	}
 
 	error = class_device_add_attrs(class_dev);
@@ -669,10 +657,10 @@ int class_device_add(struct class_device *class_dev)
  out6:
 	class_device_remove_attrs(class_dev);
  out5:
-	if (class_dev->devt_attr)
-		class_device_remove_file(class_dev, class_dev->devt_attr);
+	if (MAJOR(class_dev->devt))
+		class_device_remove_file(class_dev, &class_devt_attr);
  out4:
-	class_device_remove_file(class_dev, &class_dev->uevent_attr);
+	class_device_remove_file(class_dev, &class_uevent_attr);
  out3:
 	kobject_del(&class_dev->kobj);
  out2:
@@ -772,9 +760,9 @@ void class_device_del(struct class_device *class_dev)
 		sysfs_remove_link(&class_dev->kobj, "device");
 	}
 	sysfs_remove_link(&class_dev->kobj, "subsystem");
-	class_device_remove_file(class_dev, &class_dev->uevent_attr);
-	if (class_dev->devt_attr)
-		class_device_remove_file(class_dev, class_dev->devt_attr);
+	class_device_remove_file(class_dev, &class_uevent_attr);
+	if (MAJOR(class_dev->devt))
+		class_device_remove_file(class_dev, &class_devt_attr);
 	class_device_remove_attrs(class_dev);
 	class_device_remove_groups(class_dev);
 
diff --git a/drivers/base/core.c b/drivers/base/core.c
index e3fb87bfc6e1..0455aa78fa13 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -310,6 +310,9 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
 	return count;
 }
 
+static struct device_attribute uevent_attr =
+	__ATTR(uevent, S_IRUGO | S_IWUSR, show_uevent, store_uevent);
+
 static int device_add_attributes(struct device *dev,
 				 struct device_attribute *attrs)
 {
@@ -423,6 +426,9 @@ static ssize_t show_dev(struct device *dev, struct device_attribute *attr,
 	return print_dev_t(buf, dev->devt);
 }
 
+static struct device_attribute devt_attr =
+	__ATTR(dev, S_IRUGO, show_dev, NULL);
+
 /*
  *	devices_subsys - structure to be registered with kobject core.
  */
@@ -681,31 +687,14 @@ int device_add(struct device *dev)
 		blocking_notifier_call_chain(&dev->bus->bus_notifier,
 					     BUS_NOTIFY_ADD_DEVICE, dev);
 
-	dev->uevent_attr.attr.name = "uevent";
-	dev->uevent_attr.attr.mode = S_IRUGO | S_IWUSR;
-	dev->uevent_attr.store = store_uevent;
-	dev->uevent_attr.show = show_uevent;
-	error = device_create_file(dev, &dev->uevent_attr);
+	error = device_create_file(dev, &uevent_attr);
 	if (error)
 		goto attrError;
 
 	if (MAJOR(dev->devt)) {
-		struct device_attribute *attr;
-		attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-		if (!attr) {
-			error = -ENOMEM;
-			goto ueventattrError;
-		}
-		attr->attr.name = "dev";
-		attr->attr.mode = S_IRUGO;
-		attr->show = show_dev;
-		error = device_create_file(dev, attr);
-		if (error) {
-			kfree(attr);
+		error = device_create_file(dev, &devt_attr);
+		if (error)
 			goto ueventattrError;
-		}
-
-		dev->devt_attr = attr;
 	}
 
 	if (dev->class) {
@@ -766,10 +755,8 @@ int device_add(struct device *dev)
 					     BUS_NOTIFY_DEL_DEVICE, dev);
 	device_remove_attrs(dev);
  AttrsError:
-	if (dev->devt_attr) {
-		device_remove_file(dev, dev->devt_attr);
-		kfree(dev->devt_attr);
-	}
+	if (MAJOR(dev->devt))
+		device_remove_file(dev, &devt_attr);
 
 	if (dev->class) {
 		sysfs_remove_link(&dev->kobj, "subsystem");
@@ -791,7 +778,7 @@ int device_add(struct device *dev)
 		}
 	}
  ueventattrError:
-	device_remove_file(dev, &dev->uevent_attr);
+	device_remove_file(dev, &uevent_attr);
  attrError:
 	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	kobject_del(&dev->kobj);
@@ -868,10 +855,8 @@ void device_del(struct device * dev)
 
 	if (parent)
 		klist_del(&dev->knode_parent);
-	if (dev->devt_attr) {
-		device_remove_file(dev, dev->devt_attr);
-		kfree(dev->devt_attr);
-	}
+	if (MAJOR(dev->devt))
+		device_remove_file(dev, &devt_attr);
 	if (dev->class) {
 		sysfs_remove_link(&dev->kobj, "subsystem");
 		/* If this is not a "fake" compatible device, remove the
@@ -925,7 +910,7 @@ void device_del(struct device * dev)
 			up(&dev->class->sem);
 		}
 	}
-	device_remove_file(dev, &dev->uevent_attr);
+	device_remove_file(dev, &uevent_attr);
 	device_remove_attrs(dev);
 	bus_remove_device(dev);
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 2e1a2988b7e1..be2debed70d2 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -238,7 +238,6 @@ extern int __must_check class_device_create_file(struct class_device *,
  * @devt: for internal use by the driver core only.
  * @node: for internal use by the driver core only.
  * @kobj: for internal use by the driver core only.
- * @devt_attr: for internal use by the driver core only.
  * @groups: optional additional groups to be created
  * @dev: if set, a symlink to the struct device is created in the sysfs
  * directory for this struct class device.
@@ -263,8 +262,6 @@ struct class_device {
 	struct kobject		kobj;
 	struct class		* class;	/* required */
 	dev_t			devt;		/* dev_t, creates the sysfs "dev" */
-	struct class_device_attribute *devt_attr;
-	struct class_device_attribute uevent_attr;
 	struct device		* dev;		/* not necessary, but nice to have */
 	void			* class_data;	/* class-specific data */
 	struct class_device	*parent;	/* parent of this child device, if there is one */
@@ -419,8 +416,6 @@ struct device {
 	struct device_type	*type;
 	unsigned		is_registered:1;
 	unsigned		uevent_suppress:1;
-	struct device_attribute uevent_attr;
-	struct device_attribute *devt_attr;
 
 	struct semaphore	sem;	/* semaphore to synchronize calls to
 					 * its driver.
-- 
cgit v1.3-8-gc7d7


From b402d72cf7b338a074e3c12b305ec79284e18845 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 04:27:21 +0900
Subject: sysfs: rename sysfs_dirent->s_type to s_flags and make room for flags

Rename sysfs_dirent->s_type to s_flags, pack type into lower eight
bits and reserve the rest for flags.  sysfs_type() can used to access
the type.  All existing sd->s_type accesses are converted to use
sysfs_type().  While at it, type test is changed to equality test
instead of bit-and test where appropriate.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c        | 33 ++++++++++++++++++++-------------
 fs/sysfs/inode.c      |  8 ++++----
 fs/sysfs/mount.c      |  2 +-
 fs/sysfs/sysfs.h      |  7 ++++++-
 include/linux/sysfs.h |  3 +++
 5 files changed, 34 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b4074adbab01..eb9bc0a8717b 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -218,9 +218,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
  repeat:
 	parent_sd = sd->s_parent;
 
-	if (sd->s_type & SYSFS_KOBJ_LINK)
+	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
 		sysfs_put(sd->s_elem.symlink.target_sd);
-	if (sd->s_type & SYSFS_COPY_NAME)
+	if (sysfs_type(sd) & SYSFS_COPY_NAME)
 		kfree(sd->s_name);
 	kfree(sd->s_iattr);
 	sysfs_free_ino(sd->s_ino);
@@ -282,7 +282,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 
 	sd->s_name = name;
 	sd->s_mode = mode;
-	sd->s_type = type;
+	sd->s_flags = type;
 
 	return sd;
 
@@ -330,7 +330,7 @@ int sysfs_dirent_exist(struct sysfs_dirent *parent_sd,
 	struct sysfs_dirent * sd;
 
 	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) {
-		if (sd->s_type) {
+		if (sysfs_type(sd)) {
 			if (strcmp(sd->s_name, new))
 				continue;
 			else
@@ -446,11 +446,12 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
 	struct sysfs_dirent * sd;
+	struct bin_attribute *bin_attr;
 	struct inode *inode;
 	int found = 0;
 
 	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) {
-		if ((sd->s_type & SYSFS_NOT_PINNED) &&
+		if ((sysfs_type(sd) & SYSFS_NOT_PINNED) &&
 		    !strcmp(sd->s_name, dentry->d_name.name)) {
 			found = 1;
 			break;
@@ -468,16 +469,22 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	if (inode->i_state & I_NEW) {
 		/* initialize inode according to type */
-		if (sd->s_type & SYSFS_KOBJ_ATTR) {
+		switch (sysfs_type(sd)) {
+		case SYSFS_KOBJ_ATTR:
 			inode->i_size = PAGE_SIZE;
 			inode->i_fop = &sysfs_file_operations;
-		} else if (sd->s_type & SYSFS_KOBJ_BIN_ATTR) {
-			struct bin_attribute *bin_attr =
-				sd->s_elem.bin_attr.bin_attr;
+			break;
+		case SYSFS_KOBJ_BIN_ATTR:
+			bin_attr = sd->s_elem.bin_attr.bin_attr;
 			inode->i_size = bin_attr->size;
 			inode->i_fop = &bin_fops;
-		} else if (sd->s_type & SYSFS_KOBJ_LINK)
+			break;
+		case SYSFS_KOBJ_LINK:
 			inode->i_op = &sysfs_symlink_inode_operations;
+			break;
+		default:
+			BUG();
+		}
 	}
 
 	sysfs_instantiate(dentry, inode);
@@ -532,7 +539,7 @@ static void __sysfs_remove_dir(struct dentry *dentry)
 	while (*pos) {
 		struct sysfs_dirent *sd = *pos;
 
-		if (sd->s_type && (sd->s_type & SYSFS_NOT_PINNED)) {
+		if (sysfs_type(sd) && (sysfs_type(sd) & SYSFS_NOT_PINNED)) {
 			*pos = sd->s_sibling;
 			sd->s_sibling = removed;
 			removed = sd;
@@ -775,7 +782,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 				const char * name;
 				int len;
 
-				if (!next->s_type)
+				if (!sysfs_type(next))
 					continue;
 
 				name = next->s_name;
@@ -824,7 +831,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
 			pos = &sd->s_children;
 			while (n && *pos) {
 				struct sysfs_dirent *next = *pos;
-				if (next->s_type)
+				if (sysfs_type(next))
 					n--;
 				pos = &(*pos)->s_sibling;
 			}
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 63daa06c4194..ee3a5d957051 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -242,7 +242,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd)
 
 	dput(dentry);
 	/* XXX: unpin if directory, this will go away soon */
-	if (sd->s_type & SYSFS_DIR)
+	if (sysfs_type(sd) == SYSFS_DIR)
 		dput(dentry);
 
 	/* adjust nlink and update timestamp */
@@ -254,7 +254,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd)
 
 		inode->i_ctime = curtime;
 		drop_nlink(inode);
-		if (sd->s_type & SYSFS_DIR)
+		if (sysfs_type(sd) == SYSFS_DIR)
 			drop_nlink(inode);
 
 		mutex_unlock(&inode->i_mutex);
@@ -267,7 +267,7 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd)
 		mutex_lock(&inode->i_mutex);
 
 		inode->i_ctime = inode->i_mtime = curtime;
-		if (sd->s_type & SYSFS_DIR)
+		if (sysfs_type(sd) == SYSFS_DIR)
 			drop_nlink(inode);
 
 		mutex_unlock(&inode->i_mutex);
@@ -293,7 +293,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name)
 	for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) {
 		sd = *pos;
 
-		if (!sd->s_type)
+		if (!sysfs_type(sd))
 			continue;
 		if (!strcmp(sd->s_name, name)) {
 			*pos = sd->s_sibling;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 4be9593ea000..078537e5d696 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -26,7 +26,7 @@ static const struct super_operations sysfs_ops = {
 
 static struct sysfs_dirent sysfs_root = {
 	.s_count	= ATOMIC_INIT(1),
-	.s_type		= SYSFS_ROOT,
+	.s_flags	= SYSFS_ROOT,
 	.s_mode		= S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
 	.s_ino		= 1,
 };
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6f8aaf3805d2..06b5085804a1 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -34,7 +34,7 @@ struct sysfs_dirent {
 		struct sysfs_elem_bin_attr	bin_attr;
 	}			s_elem;
 
-	int			s_type;
+	unsigned int		s_flags;
 	umode_t			s_mode;
 	ino_t			s_ino;
 	struct dentry		* s_dentry;
@@ -86,6 +86,11 @@ extern const struct file_operations bin_fops;
 extern const struct inode_operations sysfs_dir_inode_operations;
 extern const struct inode_operations sysfs_symlink_inode_operations;
 
+static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
+{
+	return sd->s_flags & SYSFS_TYPE_MASK;
+}
+
 static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd)
 {
 	if (sd) {
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 161e19aa2b4f..58135509023e 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -74,6 +74,7 @@ struct sysfs_ops {
 	ssize_t	(*store)(struct kobject *,struct attribute *,const char *, size_t);
 };
 
+#define SYSFS_TYPE_MASK		0x00ff
 #define SYSFS_ROOT		0x0001
 #define SYSFS_DIR		0x0002
 #define SYSFS_KOBJ_ATTR 	0x0004
@@ -82,6 +83,8 @@ struct sysfs_ops {
 #define SYSFS_NOT_PINNED	(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR | SYSFS_KOBJ_LINK)
 #define SYSFS_COPY_NAME		(SYSFS_DIR | SYSFS_KOBJ_LINK)
 
+#define SYSFS_FLAG_MASK		~SYSFS_TYPE_MASK
+
 #ifdef CONFIG_SYSFS
 
 extern int sysfs_schedule_callback(struct kobject *kobj,
-- 
cgit v1.3-8-gc7d7


From 380e6fbb729a55b73d5d8409551474884e0d93fc Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 04:27:22 +0900
Subject: sysfs: implement SYSFS_FLAG_REMOVED flag

Implement SYSFS_FLAG_REMOVED flag which currently is used only to
improve sanity check in sysfs_deactivate().  The flag will be used to
make directory entries reclamiable.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c        | 4 +++-
 fs/sysfs/inode.c      | 1 +
 include/linux/sysfs.h | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index eb9bc0a8717b..f2ea00683ec9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -171,7 +171,7 @@ void sysfs_deactivate(struct sysfs_dirent *sd)
 	DECLARE_COMPLETION_ONSTACK(wait);
 	int v;
 
-	BUG_ON(sd->s_sibling);
+	BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
 	sd->s_sibling = (void *)&wait;
 
 	/* atomic_add_return() is a mb(), put_active() will always see
@@ -506,6 +506,7 @@ static void remove_dir(struct dentry * d)
 	mutex_lock(&parent->d_inode->i_mutex);
 
 	sysfs_unlink_sibling(sd);
+	sd->s_flags |= SYSFS_FLAG_REMOVED;
 
 	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
 		 atomic_read(&d->d_count));
@@ -540,6 +541,7 @@ static void __sysfs_remove_dir(struct dentry *dentry)
 		struct sysfs_dirent *sd = *pos;
 
 		if (sysfs_type(sd) && (sysfs_type(sd) & SYSFS_NOT_PINNED)) {
+			sd->s_flags |= SYSFS_FLAG_REMOVED;
 			*pos = sd->s_sibling;
 			sd->s_sibling = removed;
 			removed = sd;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index ee3a5d957051..e2f6ef138d20 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -296,6 +296,7 @@ int sysfs_hash_and_remove(struct dentry * dir, const char * name)
 		if (!sysfs_type(sd))
 			continue;
 		if (!strcmp(sd->s_name, name)) {
+			sd->s_flags |= SYSFS_FLAG_REMOVED;
 			*pos = sd->s_sibling;
 			sd->s_sibling = NULL;
 			found = 1;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 58135509023e..2a6df6444e69 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -84,6 +84,7 @@ struct sysfs_ops {
 #define SYSFS_COPY_NAME		(SYSFS_DIR | SYSFS_KOBJ_LINK)
 
 #define SYSFS_FLAG_MASK		~SYSFS_TYPE_MASK
+#define SYSFS_FLAG_REMOVED	0x0100
 
 #ifdef CONFIG_SYSFS
 
-- 
cgit v1.3-8-gc7d7


From 608e266a2d4e62c1b98c1c573064b6afe8c06a58 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 04:27:22 +0900
Subject: sysfs: make kobj point to sysfs_dirent instead of dentry

As kobj sysfs dentries and inodes are gonna be made reclaimable,
dentry can't be used as naming token for sysfs file/directory, replace
kobj->dentry with kobj->sd.  The only external interface change is
shadow directory handling.  All other changes are contained in kobj
and sysfs.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c          |   6 +--
 fs/sysfs/dir.c          | 119 ++++++++++++++++++++++++------------------------
 fs/sysfs/file.c         |  47 +++++++++----------
 fs/sysfs/group.c        |  55 +++++++++++-----------
 fs/sysfs/inode.c        |  11 +++--
 fs/sysfs/symlink.c      |  22 ++++-----
 fs/sysfs/sysfs.h        |  10 ++--
 include/linux/kobject.h |   9 ++--
 include/linux/sysfs.h   |  19 ++++----
 lib/kobject.c           |  10 ++--
 10 files changed, 156 insertions(+), 152 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 3c5574a40b09..55796bdacd3d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -234,9 +234,9 @@ const struct file_operations bin_fops = {
 
 int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 {
-	BUG_ON(!kobj || !kobj->dentry || !attr);
+	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file(kobj->dentry, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
+	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
 }
 
 
@@ -248,7 +248,7 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 
 void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 {
-	if (sysfs_hash_and_remove(kobj->dentry, attr->attr.name) < 0) {
+	if (sysfs_hash_and_remove(kobj->sd, attr->attr.name) < 0) {
 		printk(KERN_ERR "%s: "
 			"bad dentry or inode or no such file: \"%s\"\n",
 			__FUNCTION__, attr->attr.name);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4762a9aa0b27..31b6cf30636d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -368,9 +368,10 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 	return sd;
 }
 
-static int create_dir(struct kobject *kobj, struct dentry *parent,
-		      const char *name, struct dentry **p_dentry)
+static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
+		      const char *name, struct sysfs_dirent **p_sd)
 {
+	struct dentry *parent = parent_sd->s_dentry;
 	int error;
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct dentry *dentry;
@@ -409,14 +410,14 @@ static int create_dir(struct kobject *kobj, struct dentry *parent,
 
 	/* link in */
 	error = -EEXIST;
-	if (sysfs_find_dirent(parent->d_fsdata, name))
+	if (sysfs_find_dirent(parent_sd, name))
 		goto out_iput;
 
 	sysfs_instantiate(dentry, inode);
 	inc_nlink(parent->d_inode);
-	sysfs_attach_dirent(sd, parent->d_fsdata, dentry);
+	sysfs_attach_dirent(sd, parent_sd, dentry);
 
-	*p_dentry = dentry;
+	*p_sd = sd;
 	error = 0;
 	goto out_unlock;	/* pin directory dentry in core */
 
@@ -433,38 +434,37 @@ static int create_dir(struct kobject *kobj, struct dentry *parent,
 	return error;
 }
 
-
-int sysfs_create_subdir(struct kobject * k, const char * n, struct dentry ** d)
+int sysfs_create_subdir(struct kobject *kobj, const char *name,
+			struct sysfs_dirent **p_sd)
 {
-	return create_dir(k,k->dentry,n,d);
+	return create_dir(kobj, kobj->sd, name, p_sd);
 }
 
 /**
  *	sysfs_create_dir - create a directory for an object.
  *	@kobj:		object we're creating directory for. 
- *	@shadow_parent:	parent parent object.
+ *	@shadow_parent:	parent object.
  */
-
-int sysfs_create_dir(struct kobject * kobj, struct dentry *shadow_parent)
+int sysfs_create_dir(struct kobject *kobj,
+		     struct sysfs_dirent *shadow_parent_sd)
 {
-	struct dentry * dentry = NULL;
-	struct dentry * parent;
+	struct sysfs_dirent *parent_sd, *sd;
 	int error = 0;
 
 	BUG_ON(!kobj);
 
-	if (shadow_parent)
-		parent = shadow_parent;
+	if (shadow_parent_sd)
+		parent_sd = shadow_parent_sd;
 	else if (kobj->parent)
-		parent = kobj->parent->dentry;
+		parent_sd = kobj->parent->sd;
 	else if (sysfs_mount && sysfs_mount->mnt_sb)
-		parent = sysfs_mount->mnt_sb->s_root;
+		parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
 	else
 		return -EFAULT;
 
-	error = create_dir(kobj,parent,kobject_name(kobj),&dentry);
+	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
 	if (!error)
-		kobj->dentry = dentry;
+		kobj->sd = sd;
 	return error;
 }
 
@@ -525,18 +525,16 @@ const struct inode_operations sysfs_dir_inode_operations = {
 	.setattr	= sysfs_setattr,
 };
 
-static void remove_dir(struct dentry * d)
+static void remove_dir(struct sysfs_dirent *sd)
 {
-	struct dentry *parent = d->d_parent;
-	struct sysfs_dirent *sd = d->d_fsdata;
+	struct dentry *parent = sd->s_parent->s_dentry;
 
 	mutex_lock(&parent->d_inode->i_mutex);
 
 	sysfs_unlink_sibling(sd);
 	sd->s_flags |= SYSFS_FLAG_REMOVED;
 
-	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
-		 atomic_read(&d->d_count));
+	pr_debug(" o %s removing done\n", sd->s_name);
 
 	mutex_unlock(&parent->d_inode->i_mutex);
 
@@ -545,25 +543,26 @@ static void remove_dir(struct dentry * d)
 	sysfs_put(sd);
 }
 
-void sysfs_remove_subdir(struct dentry * d)
+void sysfs_remove_subdir(struct sysfs_dirent *sd)
 {
-	remove_dir(d);
+	remove_dir(sd);
 }
 
 
-static void __sysfs_remove_dir(struct dentry *dentry)
+static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 {
 	struct sysfs_dirent *removed = NULL;
-	struct sysfs_dirent *parent_sd;
 	struct sysfs_dirent **pos;
+	struct dentry *dir;
 
-	if (!dentry)
+	if (!dir_sd)
 		return;
 
-	pr_debug("sysfs %s: removing dir\n",dentry->d_name.name);
-	mutex_lock(&dentry->d_inode->i_mutex);
-	parent_sd = dentry->d_fsdata;
-	pos = &parent_sd->s_children;
+	dir = dir_sd->s_dentry;
+
+	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
+	mutex_lock(&dir->d_inode->i_mutex);
+	pos = &dir_sd->s_children;
 	while (*pos) {
 		struct sysfs_dirent *sd = *pos;
 
@@ -575,7 +574,7 @@ static void __sysfs_remove_dir(struct dentry *dentry)
 		} else
 			pos = &(*pos)->s_sibling;
 	}
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 
 	while (removed) {
 		struct sysfs_dirent *sd = removed;
@@ -588,7 +587,7 @@ static void __sysfs_remove_dir(struct dentry *dentry)
 		sysfs_put(sd);
 	}
 
-	remove_dir(dentry);
+	remove_dir(dir_sd);
 }
 
 /**
@@ -602,25 +601,25 @@ static void __sysfs_remove_dir(struct dentry *dentry)
 
 void sysfs_remove_dir(struct kobject * kobj)
 {
-	struct dentry *d = kobj->dentry;
+	struct sysfs_dirent *sd = kobj->sd;
 
 	spin_lock(&kobj_sysfs_assoc_lock);
-	kobj->dentry = NULL;
+	kobj->sd = NULL;
 	spin_unlock(&kobj_sysfs_assoc_lock);
 
-	__sysfs_remove_dir(d);
+	__sysfs_remove_dir(sd);
 }
 
-int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent,
+int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 		     const char *new_name)
 {
-	struct sysfs_dirent *sd = kobj->dentry->d_fsdata;
-	struct sysfs_dirent *parent_sd = new_parent->d_fsdata;
+	struct sysfs_dirent *sd = kobj->sd;
+	struct dentry *new_parent = new_parent_sd->s_dentry;
 	struct dentry *new_dentry;
 	char *dup_name;
 	int error;
 
-	if (!new_parent)
+	if (!new_parent_sd)
 		return -EFAULT;
 
 	down_write(&sysfs_rename_sem);
@@ -637,9 +636,9 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent,
 	 * shadows of the same directory
 	 */
 	error = -EINVAL;
-	if (kobj->dentry->d_parent->d_inode != new_parent->d_inode ||
+	if (sd->s_parent->s_dentry->d_inode != new_parent->d_inode ||
 	    new_dentry->d_parent->d_inode != new_parent->d_inode ||
-	    new_dentry == kobj->dentry)
+	    new_dentry == sd->s_dentry)
 		goto out_dput;
 
 	error = -EEXIST;
@@ -661,12 +660,12 @@ int sysfs_rename_dir(struct kobject * kobj, struct dentry *new_parent,
 
 	/* move under the new parent */
 	d_add(new_dentry, NULL);
-	d_move(kobj->dentry, new_dentry);
+	d_move(sd->s_dentry, new_dentry);
 
 	sysfs_unlink_sibling(sd);
-	sysfs_get(parent_sd);
+	sysfs_get(new_parent_sd);
 	sysfs_put(sd->s_parent);
-	sd->s_parent = parent_sd;
+	sd->s_parent = new_parent_sd;
 	sysfs_link_sibling(sd);
 
 	error = 0;
@@ -691,9 +690,9 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent)
 	int error;
 
 	old_parent_dentry = kobj->parent ?
-		kobj->parent->dentry : sysfs_mount->mnt_sb->s_root;
+		kobj->parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root;
 	new_parent_dentry = new_parent ?
-		new_parent->dentry : sysfs_mount->mnt_sb->s_root;
+		new_parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root;
 
 	if (old_parent_dentry->d_inode == new_parent_dentry->d_inode)
 		return 0;	/* nothing to move */
@@ -705,7 +704,7 @@ again:
 	}
 
 	new_parent_sd = new_parent_dentry->d_fsdata;
-	sd = kobj->dentry->d_fsdata;
+	sd = kobj->sd;
 
 	new_dentry = lookup_one_len(kobj->name, new_parent_dentry,
 				    strlen(kobj->name));
@@ -715,7 +714,7 @@ again:
 	} else
 		error = 0;
 	d_add(new_dentry, NULL);
-	d_move(kobj->dentry, new_dentry);
+	d_move(sd->s_dentry, new_dentry);
 	dput(new_dentry);
 
 	/* Remove from old parent's list and insert into new parent's list. */
@@ -885,7 +884,7 @@ int sysfs_make_shadowed_dir(struct kobject *kobj,
 	struct inode *inode;
 	struct inode_operations *i_op;
 
-	inode = kobj->dentry->d_inode;
+	inode = kobj->sd->s_dentry->d_inode;
 	if (inode->i_op != &sysfs_dir_inode_operations)
 		return -EINVAL;
 
@@ -912,16 +911,16 @@ int sysfs_make_shadowed_dir(struct kobject *kobj,
  *	directory.
  */
 
-struct dentry *sysfs_create_shadow_dir(struct kobject *kobj)
+struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj)
 {
-	struct dentry *dir = kobj->dentry;
+	struct dentry *dir = kobj->sd->s_dentry;
 	struct inode *inode = dir->d_inode;
 	struct dentry *parent = dir->d_parent;
 	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct dentry *shadow;
 	struct sysfs_dirent *sd;
 
-	shadow = ERR_PTR(-EINVAL);
+	sd = ERR_PTR(-EINVAL);
 	if (!sysfs_is_shadowed_inode(inode))
 		goto out;
 
@@ -944,25 +943,25 @@ struct dentry *sysfs_create_shadow_dir(struct kobject *kobj)
 	dget(shadow);		/* Extra count - pin the dentry in core */
 
 out:
-	return shadow;
+	return sd;
 nomem:
 	dput(shadow);
-	shadow = ERR_PTR(-ENOMEM);
+	sd = ERR_PTR(-ENOMEM);
 	goto out;
 }
 
 /**
  *	sysfs_remove_shadow_dir - remove an object's directory.
- *	@shadow: dentry of shadow directory
+ *	@shadow_sd: sysfs_dirent of shadow directory
  *
  *	The only thing special about this is that we remove any files in
  *	the directory before we remove the directory, and we've inlined
  *	what used to be sysfs_rmdir() below, instead of calling separately.
  */
 
-void sysfs_remove_shadow_dir(struct dentry *shadow)
+void sysfs_remove_shadow_dir(struct sysfs_dirent *shadow_sd)
 {
-	__sysfs_remove_dir(shadow);
+	__sysfs_remove_dir(shadow_sd);
 }
 
 const struct file_operations sysfs_dir_operations = {
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e448b88e313e..20703b9ee064 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -385,7 +385,7 @@ static struct dentry *step_down(struct dentry *dir, const char * name)
 
 void sysfs_notify(struct kobject * k, char *dir, char *attr)
 {
-	struct dentry *de = k->dentry;
+	struct dentry *de = k->sd->s_dentry;
 	if (de)
 		dget(de);
 	if (de && dir)
@@ -412,16 +412,17 @@ const struct file_operations sysfs_file_operations = {
 };
 
 
-int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type)
+int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
+		   int type)
 {
-	struct sysfs_dirent * parent_sd = dir->d_fsdata;
+	struct dentry *dir = dir_sd->s_dentry;
 	umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG;
 	struct sysfs_dirent *sd;
 	int error = 0;
 
 	mutex_lock(&dir->d_inode->i_mutex);
 
-	if (sysfs_find_dirent(parent_sd, attr->name)) {
+	if (sysfs_find_dirent(dir_sd, attr->name)) {
 		error = -EEXIST;
 		goto out_unlock;
 	}
@@ -432,7 +433,7 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type)
 		goto out_unlock;
 	}
 	sd->s_elem.attr.attr = (void *)attr;
-	sysfs_attach_dirent(sd, parent_sd, NULL);
+	sysfs_attach_dirent(sd, dir_sd, NULL);
 
  out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
@@ -448,9 +449,9 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type)
 
 int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
 {
-	BUG_ON(!kobj || !kobj->dentry || !attr);
+	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR);
+	return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
 
 }
 
@@ -464,16 +465,16 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
 int sysfs_add_file_to_group(struct kobject *kobj,
 		const struct attribute *attr, const char *group)
 {
-	struct dentry *dir;
+	struct sysfs_dirent *dir_sd;
 	int error;
 
-	dir = lookup_one_len(group, kobj->dentry, strlen(group));
-	if (IS_ERR(dir))
-		error = PTR_ERR(dir);
-	else {
-		error = sysfs_add_file(dir, attr, SYSFS_KOBJ_ATTR);
-		dput(dir);
-	}
+	dir_sd = sysfs_get_dirent(kobj->sd, group);
+	if (!dir_sd)
+		return -ENOENT;
+
+	error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR);
+	sysfs_put(dir_sd);
+
 	return error;
 }
 EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
@@ -486,7 +487,7 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
  */
 int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 {
-	struct dentry * dir = kobj->dentry;
+	struct dentry *dir = kobj->sd->s_dentry;
 	struct dentry * victim;
 	int res = -ENOENT;
 
@@ -522,7 +523,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
  */
 int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 {
-	struct dentry *dir = kobj->dentry;
+	struct dentry *dir = kobj->sd->s_dentry;
 	struct dentry *victim;
 	struct inode * inode;
 	struct iattr newattrs;
@@ -560,7 +561,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 {
-	sysfs_hash_and_remove(kobj->dentry, attr->name);
+	sysfs_hash_and_remove(kobj->sd, attr->name);
 }
 
 
@@ -573,12 +574,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 void sysfs_remove_file_from_group(struct kobject *kobj,
 		const struct attribute *attr, const char *group)
 {
-	struct dentry *dir;
+	struct sysfs_dirent *dir_sd;
 
-	dir = lookup_one_len(group, kobj->dentry, strlen(group));
-	if (!IS_ERR(dir)) {
-		sysfs_hash_and_remove(dir, attr->name);
-		dput(dir);
+	dir_sd = sysfs_get_dirent(kobj->sd, group);
+	if (dir_sd) {
+		sysfs_hash_and_remove(dir_sd, attr->name);
+		sysfs_put(dir_sd);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 52eed2a7a5ef..f318b73c790c 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,26 +18,25 @@
 #include "sysfs.h"
 
 
-static void remove_files(struct dentry * dir, 
-			 const struct attribute_group * grp)
+static void remove_files(struct sysfs_dirent *dir_sd,
+			 const struct attribute_group *grp)
 {
 	struct attribute *const* attr;
 
 	for (attr = grp->attrs; *attr; attr++)
-		sysfs_hash_and_remove(dir,(*attr)->name);
+		sysfs_hash_and_remove(dir_sd, (*attr)->name);
 }
 
-static int create_files(struct dentry * dir,
-			const struct attribute_group * grp)
+static int create_files(struct sysfs_dirent *dir_sd,
+			const struct attribute_group *grp)
 {
 	struct attribute *const* attr;
 	int error = 0;
 
-	for (attr = grp->attrs; *attr && !error; attr++) {
-		error = sysfs_add_file(dir, *attr, SYSFS_KOBJ_ATTR);
-	}
+	for (attr = grp->attrs; *attr && !error; attr++)
+		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
 	if (error)
-		remove_files(dir,grp);
+		remove_files(dir_sd, grp);
 	return error;
 }
 
@@ -45,44 +44,44 @@ static int create_files(struct dentry * dir,
 int sysfs_create_group(struct kobject * kobj, 
 		       const struct attribute_group * grp)
 {
-	struct dentry * dir;
+	struct sysfs_dirent *sd;
 	int error;
 
-	BUG_ON(!kobj || !kobj->dentry);
+	BUG_ON(!kobj || !kobj->sd);
 
 	if (grp->name) {
-		error = sysfs_create_subdir(kobj,grp->name,&dir);
+		error = sysfs_create_subdir(kobj, grp->name, &sd);
 		if (error)
 			return error;
 	} else
-		dir = kobj->dentry;
-	dir = dget(dir);
-	if ((error = create_files(dir,grp))) {
+		sd = kobj->sd;
+	sysfs_get(sd);
+	error = create_files(sd, grp);
+	if (error) {
 		if (grp->name)
-			sysfs_remove_subdir(dir);
+			sysfs_remove_subdir(sd);
 	}
-	dput(dir);
+	sysfs_put(sd);
 	return error;
 }
 
 void sysfs_remove_group(struct kobject * kobj, 
 			const struct attribute_group * grp)
 {
-	struct dentry * dir;
+	struct sysfs_dirent *dir_sd = kobj->sd;
+	struct sysfs_dirent *sd;
 
 	if (grp->name) {
-		dir = lookup_one_len_kern(grp->name, kobj->dentry,
-				strlen(grp->name));
-		BUG_ON(IS_ERR(dir));
-	}
-	else
-		dir = dget(kobj->dentry);
+		sd = sysfs_get_dirent(dir_sd, grp->name);
+		BUG_ON(!sd);
+	} else
+		sd = sysfs_get(dir_sd);
 
-	remove_files(dir,grp);
+	remove_files(sd, grp);
 	if (grp->name)
-		sysfs_remove_subdir(dir);
-	/* release the ref. taken in this routine */
-	dput(dir);
+		sysfs_remove_subdir(sd);
+
+	sysfs_put(sd);
 }
 
 
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e2f6ef138d20..1be853706e99 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -275,22 +275,23 @@ void sysfs_drop_dentry(struct sysfs_dirent *sd)
 	}
 }
 
-int sysfs_hash_and_remove(struct dentry * dir, const char * name)
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 {
+	struct dentry *dir;
 	struct sysfs_dirent **pos, *sd;
-	struct sysfs_dirent *parent_sd;
 	int found = 0;
 
-	if (!dir)
+	if (!dir_sd)
 		return -ENOENT;
 
+	dir = dir_sd->s_dentry;
+
 	if (dir->d_inode == NULL)
 		/* no inode means this hasn't been made visible yet */
 		return -ENOENT;
 
-	parent_sd = dir->d_fsdata;
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) {
+	for (pos = &dir_sd->s_children; *pos; pos = &(*pos)->s_sibling) {
 		sd = *pos;
 
 		if (!sysfs_type(sd))
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 45b62e229627..43cc5222f136 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -66,7 +66,6 @@ static int sysfs_add_link(struct sysfs_dirent * parent_sd, const char * name,
  */
 int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name)
 {
-	struct dentry *dentry = NULL;
 	struct sysfs_dirent *parent_sd = NULL;
 	struct sysfs_dirent *target_sd = NULL;
 	int error = -EEXIST;
@@ -75,29 +74,28 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 
 	if (!kobj) {
 		if (sysfs_mount && sysfs_mount->mnt_sb)
-			dentry = sysfs_mount->mnt_sb->s_root;
+			parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
 	} else
-		dentry = kobj->dentry;
+		parent_sd = kobj->sd;
 
-	if (!dentry)
+	if (!parent_sd)
 		return -EFAULT;
-	parent_sd = dentry->d_fsdata;
 
-	/* target->dentry can go away beneath us but is protected with
+	/* target->sd can go away beneath us but is protected with
 	 * kobj_sysfs_assoc_lock.  Fetch target_sd from it.
 	 */
 	spin_lock(&kobj_sysfs_assoc_lock);
-	if (target->dentry)
-		target_sd = sysfs_get(target->dentry->d_fsdata);
+	if (target->sd)
+		target_sd = sysfs_get(target->sd);
 	spin_unlock(&kobj_sysfs_assoc_lock);
 
 	if (!target_sd)
 		return -ENOENT;
 
-	mutex_lock(&dentry->d_inode->i_mutex);
-	if (!sysfs_find_dirent(dentry->d_fsdata, name))
+	mutex_lock(&parent_sd->s_dentry->d_inode->i_mutex);
+	if (!sysfs_find_dirent(parent_sd, name))
 		error = sysfs_add_link(parent_sd, name, target_sd);
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	mutex_unlock(&parent_sd->s_dentry->d_inode->i_mutex);
 
 	if (error)
 		sysfs_put(target_sd);
@@ -114,7 +112,7 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 
 void sysfs_remove_link(struct kobject * kobj, const char * name)
 {
-	sysfs_hash_and_remove(kobj->dentry,name);
+	sysfs_hash_and_remove(kobj->sd, name);
 }
 
 static int sysfs_get_target_path(struct sysfs_dirent * parent_sd,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index f1629b4520aa..27a5f4b4e3b0 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -69,12 +69,14 @@ extern void sysfs_attach_dirent(struct sysfs_dirent *sd,
 				struct sysfs_dirent *parent_sd,
 				struct dentry *dentry);
 
-extern int sysfs_add_file(struct dentry *, const struct attribute *, int);
-extern int sysfs_hash_and_remove(struct dentry * dir, const char * name);
+extern int sysfs_add_file(struct sysfs_dirent *dir_sd,
+			  const struct attribute *attr, int type);
+extern int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
 extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
 
-extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **);
-extern void sysfs_remove_subdir(struct dentry *);
+extern int sysfs_create_subdir(struct kobject *kobj, const char *name,
+			       struct sysfs_dirent **p_sd);
+extern void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
 extern void sysfs_drop_dentry(struct sysfs_dirent *sd);
 extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index c288e41ba331..06cbf41d32d2 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -55,7 +55,7 @@ struct kobject {
 	struct kobject		* parent;
 	struct kset		* kset;
 	struct kobj_type	* ktype;
-	struct dentry		* dentry;
+	struct sysfs_dirent	* sd;
 	wait_queue_head_t	poll;
 };
 
@@ -71,13 +71,14 @@ extern void kobject_init(struct kobject *);
 extern void kobject_cleanup(struct kobject *);
 
 extern int __must_check kobject_add(struct kobject *);
-extern int __must_check kobject_shadow_add(struct kobject *, struct dentry *);
+extern int __must_check kobject_shadow_add(struct kobject *kobj,
+					   struct sysfs_dirent *shadow_parent);
 extern void kobject_del(struct kobject *);
 
 extern int __must_check kobject_rename(struct kobject *, const char *new_name);
 extern int __must_check kobject_shadow_rename(struct kobject *kobj,
-						struct dentry *new_parent,
-						const char *new_name);
+					      struct sysfs_dirent *new_parent,
+					      const char *new_name);
 extern int __must_check kobject_move(struct kobject *, struct kobject *);
 
 extern int __must_check kobject_register(struct kobject *);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 2a6df6444e69..4c43030fae5d 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -19,6 +19,7 @@ struct kobject;
 struct module;
 struct nameidata;
 struct dentry;
+struct sysfs_dirent;
 
 /* FIXME
  * The *owner field is no longer used, but leave around
@@ -92,13 +93,14 @@ extern int sysfs_schedule_callback(struct kobject *kobj,
 		void (*func)(void *), void *data, struct module *owner);
 
 extern int __must_check
-sysfs_create_dir(struct kobject *, struct dentry *);
+sysfs_create_dir(struct kobject *kobj, struct sysfs_dirent *shadow_parent_sd);
 
 extern void
 sysfs_remove_dir(struct kobject *);
 
 extern int __must_check
-sysfs_rename_dir(struct kobject *, struct dentry *, const char *new_name);
+sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
+		 const char *new_name);
 
 extern int __must_check
 sysfs_move_dir(struct kobject *, struct kobject *);
@@ -138,8 +140,8 @@ void sysfs_notify(struct kobject * k, char *dir, char *attr);
 
 extern int sysfs_make_shadowed_dir(struct kobject *kobj,
 	void * (*follow_link)(struct dentry *, struct nameidata *));
-extern struct dentry *sysfs_create_shadow_dir(struct kobject *kobj);
-extern void sysfs_remove_shadow_dir(struct dentry *dir);
+extern struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj);
+extern void sysfs_remove_shadow_dir(struct sysfs_dirent *shadow_sd);
 
 extern int __must_check sysfs_init(void);
 
@@ -151,7 +153,8 @@ static inline int sysfs_schedule_callback(struct kobject *kobj,
 	return -ENOSYS;
 }
 
-static inline int sysfs_create_dir(struct kobject * k, struct dentry *shadow)
+static inline int sysfs_create_dir(struct kobject *kobj,
+				   struct sysfs_dirent *shadow_parent_sd)
 {
 	return 0;
 }
@@ -161,9 +164,9 @@ static inline void sysfs_remove_dir(struct kobject * k)
 	;
 }
 
-static inline int sysfs_rename_dir(struct kobject * k,
-					struct dentry *new_parent,
-					const char *new_name)
+static inline int sysfs_rename_dir(struct kobject *kobj,
+				   struct sysfs_dirent *new_parent_sd,
+				   const char *new_name)
 {
 	return 0;
 }
diff --git a/lib/kobject.c b/lib/kobject.c
index b4ebd7631700..4b08e0ff95c8 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -44,7 +44,7 @@ static int populate_dir(struct kobject * kobj)
 	return error;
 }
 
-static int create_dir(struct kobject * kobj, struct dentry *shadow_parent)
+static int create_dir(struct kobject *kobj, struct sysfs_dirent *shadow_parent)
 {
 	int error = 0;
 	if (kobject_name(kobj)) {
@@ -162,7 +162,7 @@ static void unlink(struct kobject * kobj)
  *	@shadow_parent: sysfs directory to add to.
  */
 
-int kobject_shadow_add(struct kobject * kobj, struct dentry *shadow_parent)
+int kobject_shadow_add(struct kobject *kobj, struct sysfs_dirent *shadow_parent)
 {
 	int error = 0;
 	struct kobject * parent;
@@ -338,7 +338,7 @@ int kobject_rename(struct kobject * kobj, const char *new_name)
 	/* Note : if we want to send the new name alone, not the full path,
 	 * we could probably use kobject_name(kobj); */
 
-	error = sysfs_rename_dir(kobj, kobj->parent->dentry, new_name);
+	error = sysfs_rename_dir(kobj, kobj->parent->sd, new_name);
 
 	/* This function is mostly/only used for network interface.
 	 * Some hotplug package track interfaces by their name and
@@ -361,8 +361,8 @@ out:
  *	@new_name: object's new name
  */
 
-int kobject_shadow_rename(struct kobject * kobj, struct dentry *new_parent,
-			  const char *new_name)
+int kobject_shadow_rename(struct kobject *kobj,
+			  struct sysfs_dirent *new_parent, const char *new_name)
 {
 	int error = 0;
 
-- 
cgit v1.3-8-gc7d7


From 51225039f3cf9d250596d1344494b293274b9169 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Jun 2007 04:27:25 +0900
Subject: sysfs: make directory dentries and inodes reclaimable

This patch makes dentries and inodes for sysfs directories
reclaimable.

* sysfs_notify() is modified to walk sysfs_dirent tree instead of
  dentry tree.

* sysfs_update_file() and sysfs_chmod_file() use sysfs_get_dentry() to
  grab the victim dentry.

* sysfs_rename_dir() and sysfs_move_dir() grab all dentries using
  sysfs_get_dentry() on startup.

* Dentries for all shadowed directories are pinned in memory to serve
  as lookup start point.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c        | 231 +++++++++++++++++++++++++++-----------------------
 fs/sysfs/file.c       | 134 +++++++++++++----------------
 fs/sysfs/mount.c      |   2 +-
 fs/sysfs/sysfs.h      |   1 +
 include/linux/sysfs.h |   1 -
 5 files changed, 187 insertions(+), 182 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 987211296106..aee966c44aac 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -568,10 +568,10 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd)
 	spin_unlock(&dcache_lock);
 	spin_unlock(&sysfs_assoc_lock);
 
-	dput(dentry);
-	/* XXX: unpin if directory, this will go away soon */
-	if (sysfs_type(sd) == SYSFS_DIR)
+	/* dentries for shadowed inodes are pinned, unpin */
+	if (dentry && sysfs_is_shadowed_inode(dentry->d_inode))
 		dput(dentry);
+	dput(dentry);
 
 	/* adjust nlink and update timestamp */
 	inode = ilookup(sysfs_sb, sd->s_ino);
@@ -686,69 +686,29 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 		      const char *name, struct sysfs_dirent **p_sd)
 {
-	struct dentry *parent = parent_sd->s_dentry;
-	struct sysfs_addrm_cxt acxt;
-	int error;
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
-	struct dentry *dentry;
-	struct inode *inode;
+	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
 
-	sysfs_addrm_start(&acxt, parent_sd);
-
 	/* allocate */
-	dentry = lookup_one_len(name, parent, strlen(name));
-	if (IS_ERR(dentry)) {
-		error = PTR_ERR(dentry);
-		goto out_finish;
-	}
-
-	error = -EEXIST;
-	if (dentry->d_inode)
-		goto out_dput;
-
-	error = -ENOMEM;
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
 	if (!sd)
-		goto out_drop;
+		return -ENOMEM;
 	sd->s_elem.dir.kobj = kobj;
 
-	inode = sysfs_get_inode(sd);
-	if (!inode)
-		goto out_sput;
-
-	if (inode->i_state & I_NEW) {
-		inode->i_op = &sysfs_dir_inode_operations;
-		inode->i_fop = &sysfs_dir_operations;
-		/* directory inodes start off with i_nlink == 2 (for ".") */
-		inc_nlink(inode);
-	}
-
 	/* link in */
-	error = -EEXIST;
-	if (sysfs_find_dirent(parent_sd, name))
-		goto out_iput;
-
-	sysfs_add_one(&acxt, sd);
-	sysfs_link_sibling(sd);
-	sysfs_instantiate(dentry, inode);
-	sysfs_attach_dentry(sd, dentry);
-
-	*p_sd = sd;
-	error = 0;
-	goto out_finish;	/* pin directory dentry in core */
+	sysfs_addrm_start(&acxt, parent_sd);
+	if (!sysfs_find_dirent(parent_sd, name)) {
+		sysfs_add_one(&acxt, sd);
+		sysfs_link_sibling(sd);
+	}
+	if (sysfs_addrm_finish(&acxt)) {
+		*p_sd = sd;
+		return 0;
+	}
 
- out_iput:
-	iput(inode);
- out_sput:
 	sysfs_put(sd);
- out_drop:
-	d_drop(dentry);
- out_dput:
-	dput(dentry);
- out_finish:
-	sysfs_addrm_finish(&acxt);
-	return error;
+	return -EEXIST;
 }
 
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
@@ -785,6 +745,17 @@ int sysfs_create_dir(struct kobject *kobj,
 	return error;
 }
 
+static int sysfs_count_nlink(struct sysfs_dirent *sd)
+{
+	struct sysfs_dirent *child;
+	int nr = 0;
+
+	for (child = sd->s_children; child; child = child->s_sibling)
+		if (sysfs_type(child) == SYSFS_DIR)
+			nr++;
+	return nr + 2;
+}
+
 static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd)
 {
@@ -795,7 +766,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	int found = 0;
 
 	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) {
-		if ((sysfs_type(sd) & SYSFS_NOT_PINNED) &&
+		if (sysfs_type(sd) &&
 		    !strcmp(sd->s_name, dentry->d_name.name)) {
 			found = 1;
 			break;
@@ -816,6 +787,11 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (inode->i_state & I_NEW) {
 		/* initialize inode according to type */
 		switch (sysfs_type(sd)) {
+		case SYSFS_DIR:
+			inode->i_op = &sysfs_dir_inode_operations;
+			inode->i_fop = &sysfs_dir_operations;
+			inode->i_nlink = sysfs_count_nlink(sd);
+			break;
 		case SYSFS_KOBJ_ATTR:
 			inode->i_size = PAGE_SIZE;
 			inode->i_fop = &sysfs_file_operations;
@@ -876,7 +852,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 	while (*pos) {
 		struct sysfs_dirent *sd = *pos;
 
-		if (sysfs_type(sd) && (sysfs_type(sd) & SYSFS_NOT_PINNED)) {
+		if (sysfs_type(sd) && sysfs_type(sd) != SYSFS_DIR) {
 			*pos = sd->s_sibling;
 			sd->s_sibling = NULL;
 			sysfs_remove_one(&acxt, sd);
@@ -912,14 +888,25 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 		     const char *new_name)
 {
 	struct sysfs_dirent *sd = kobj->sd;
-	struct dentry *new_parent = new_parent_sd->s_dentry;
-	struct dentry *new_dentry;
-	char *dup_name;
+	struct dentry *new_parent = NULL;
+	struct dentry *old_dentry = NULL, *new_dentry = NULL;
+	const char *dup_name = NULL;
 	int error;
 
-	if (!new_parent_sd)
-		return -EFAULT;
+	/* get dentries */
+	old_dentry = sysfs_get_dentry(sd);
+	if (IS_ERR(old_dentry)) {
+		error = PTR_ERR(old_dentry);
+		goto out_dput;
+	}
+
+	new_parent = sysfs_get_dentry(new_parent_sd);
+	if (IS_ERR(new_parent)) {
+		error = PTR_ERR(new_parent);
+		goto out_dput;
+	}
 
+	/* lock new_parent and get dentry for new name */
 	mutex_lock(&new_parent->d_inode->i_mutex);
 
 	new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name));
@@ -933,14 +920,14 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 	 * shadows of the same directory
 	 */
 	error = -EINVAL;
-	if (sd->s_parent->s_dentry->d_inode != new_parent->d_inode ||
+	if (old_dentry->d_parent->d_inode != new_parent->d_inode ||
 	    new_dentry->d_parent->d_inode != new_parent->d_inode ||
-	    new_dentry == sd->s_dentry)
-		goto out_dput;
+	    old_dentry == new_dentry)
+		goto out_unlock;
 
 	error = -EEXIST;
 	if (new_dentry->d_inode)
-		goto out_dput;
+		goto out_unlock;
 
 	/* rename kobject and sysfs_dirent */
 	error = -ENOMEM;
@@ -950,9 +937,9 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 
 	error = kobject_set_name(kobj, "%s", new_name);
 	if (error)
-		goto out_free;
+		goto out_drop;
 
-	kfree(sd->s_name);
+	dup_name = sd->s_name;
 	sd->s_name = new_name;
 
 	/* move under the new parent */
@@ -972,45 +959,58 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 	error = 0;
 	goto out_unlock;
 
- out_free:
-	kfree(dup_name);
  out_drop:
 	d_drop(new_dentry);
- out_dput:
-	dput(new_dentry);
  out_unlock:
 	mutex_unlock(&new_parent->d_inode->i_mutex);
+ out_dput:
+	kfree(dup_name);
+	dput(new_parent);
+	dput(old_dentry);
+	dput(new_dentry);
 	return error;
 }
 
-int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent)
+int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
-	struct dentry *old_parent_dentry, *new_parent_dentry, *new_dentry;
-	struct sysfs_dirent *new_parent_sd, *sd;
+	struct sysfs_dirent *sd = kobj->sd;
+	struct sysfs_dirent *new_parent_sd;
+	struct dentry *old_parent, *new_parent = NULL;
+	struct dentry *old_dentry = NULL, *new_dentry = NULL;
 	int error;
 
-	old_parent_dentry = kobj->parent ?
-		kobj->parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root;
-	new_parent_dentry = new_parent ?
-		new_parent->sd->s_dentry : sysfs_mount->mnt_sb->s_root;
+	BUG_ON(!sd->s_parent);
+	new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
+
+	/* get dentries */
+	old_dentry = sysfs_get_dentry(sd);
+	if (IS_ERR(old_dentry)) {
+		error = PTR_ERR(old_dentry);
+		goto out_dput;
+	}
+	old_parent = sd->s_parent->s_dentry;
+
+	new_parent = sysfs_get_dentry(new_parent_sd);
+	if (IS_ERR(new_parent)) {
+		error = PTR_ERR(new_parent);
+		goto out_dput;
+	}
 
-	if (old_parent_dentry->d_inode == new_parent_dentry->d_inode)
-		return 0;	/* nothing to move */
+	if (old_parent->d_inode == new_parent->d_inode) {
+		error = 0;
+		goto out_dput;	/* nothing to move */
+	}
 again:
-	mutex_lock(&old_parent_dentry->d_inode->i_mutex);
-	if (!mutex_trylock(&new_parent_dentry->d_inode->i_mutex)) {
-		mutex_unlock(&old_parent_dentry->d_inode->i_mutex);
+	mutex_lock(&old_parent->d_inode->i_mutex);
+	if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
+		mutex_unlock(&old_parent->d_inode->i_mutex);
 		goto again;
 	}
 
-	new_parent_sd = new_parent_dentry->d_fsdata;
-	sd = kobj->sd;
-
-	new_dentry = lookup_one_len(kobj->name, new_parent_dentry,
-				    strlen(kobj->name));
+	new_dentry = lookup_one_len(kobj->name, new_parent, strlen(kobj->name));
 	if (IS_ERR(new_dentry)) {
 		error = PTR_ERR(new_dentry);
-		goto out;
+		goto out_unlock;
 	} else
 		error = 0;
 	d_add(new_dentry, NULL);
@@ -1027,10 +1027,14 @@ again:
 	sysfs_link_sibling(sd);
 
 	mutex_unlock(&sysfs_mutex);
-out:
-	mutex_unlock(&new_parent_dentry->d_inode->i_mutex);
-	mutex_unlock(&old_parent_dentry->d_inode->i_mutex);
 
+ out_unlock:
+	mutex_unlock(&new_parent->d_inode->i_mutex);
+	mutex_unlock(&old_parent->d_inode->i_mutex);
+ out_dput:
+	dput(new_parent);
+	dput(old_dentry);
+	dput(new_dentry);
 	return error;
 }
 
@@ -1191,12 +1195,20 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
 int sysfs_make_shadowed_dir(struct kobject *kobj,
 	void * (*follow_link)(struct dentry *, struct nameidata *))
 {
+	struct dentry *dentry;
 	struct inode *inode;
 	struct inode_operations *i_op;
 
-	inode = kobj->sd->s_dentry->d_inode;
-	if (inode->i_op != &sysfs_dir_inode_operations)
+	/* get dentry for @kobj->sd, dentry of a shadowed dir is pinned */
+	dentry = sysfs_get_dentry(kobj->sd);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	inode = dentry->d_inode;
+	if (inode->i_op != &sysfs_dir_inode_operations) {
+		dput(dentry);
 		return -EINVAL;
+	}
 
 	i_op = kmalloc(sizeof(*i_op), GFP_KERNEL);
 	if (!i_op)
@@ -1223,17 +1235,23 @@ int sysfs_make_shadowed_dir(struct kobject *kobj,
 
 struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj)
 {
-	struct dentry *dir = kobj->sd->s_dentry;
-	struct inode *inode = dir->d_inode;
-	struct dentry *parent = dir->d_parent;
-	struct sysfs_dirent *parent_sd = parent->d_fsdata;
-	struct dentry *shadow;
+	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	struct dentry *dir, *parent, *shadow;
+	struct inode *inode;
 	struct sysfs_dirent *sd;
 	struct sysfs_addrm_cxt acxt;
 
+	dir = sysfs_get_dentry(kobj->sd);
+	if (IS_ERR(dir)) {
+		sd = (void *)dir;
+		goto out;
+	}
+	parent = dir->d_parent;
+
+	inode = dir->d_inode;
 	sd = ERR_PTR(-EINVAL);
 	if (!sysfs_is_shadowed_inode(inode))
-		goto out;
+		goto out_dput;
 
 	shadow = d_alloc(parent, &dir->d_name);
 	if (!shadow)
@@ -1258,12 +1276,15 @@ struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj)
 
 	dget(shadow);		/* Extra count - pin the dentry in core */
 
-out:
-	return sd;
-nomem:
+	goto out_dput;
+
+ nomem:
 	dput(shadow);
 	sd = ERR_PTR(-ENOMEM);
-	goto out;
+ out_dput:
+	dput(dir);
+ out:
+	return sd;
 }
 
 /**
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 69bacf1db596..cc497994b2a8 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -362,43 +362,22 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	return POLLERR|POLLPRI;
 }
 
-
-static struct dentry *step_down(struct dentry *dir, const char * name)
+void sysfs_notify(struct kobject *k, char *dir, char *attr)
 {
-	struct dentry * de;
-
-	if (dir == NULL || dir->d_inode == NULL)
-		return NULL;
-
-	mutex_lock(&dir->d_inode->i_mutex);
-	de = lookup_one_len(name, dir, strlen(name));
-	mutex_unlock(&dir->d_inode->i_mutex);
-	dput(dir);
-	if (IS_ERR(de))
-		return NULL;
-	if (de->d_inode == NULL) {
-		dput(de);
-		return NULL;
-	}
-	return de;
-}
+	struct sysfs_dirent *sd = k->sd;
 
-void sysfs_notify(struct kobject * k, char *dir, char *attr)
-{
-	struct dentry *de = k->sd->s_dentry;
-	if (de)
-		dget(de);
-	if (de && dir)
-		de = step_down(de, dir);
-	if (de && attr)
-		de = step_down(de, attr);
-	if (de) {
-		struct sysfs_dirent * sd = de->d_fsdata;
-		if (sd)
-			atomic_inc(&sd->s_event);
+	mutex_lock(&sysfs_mutex);
+
+	if (sd && dir)
+		sd = sysfs_find_dirent(sd, dir);
+	if (sd && attr)
+		sd = sysfs_find_dirent(sd, attr);
+	if (sd) {
+		atomic_inc(&sd->s_event);
 		wake_up_interruptible(&k->poll);
-		dput(de);
 	}
+
+	mutex_unlock(&sysfs_mutex);
 }
 EXPORT_SYMBOL_GPL(sysfs_notify);
 
@@ -485,30 +464,31 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
  */
 int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 {
-	struct dentry *dir = kobj->sd->s_dentry;
-	struct dentry * victim;
-	int res = -ENOENT;
-
-	mutex_lock(&dir->d_inode->i_mutex);
-	victim = lookup_one_len(attr->name, dir, strlen(attr->name));
-	if (!IS_ERR(victim)) {
-		/* make sure dentry is really there */
-		if (victim->d_inode && 
-		    (victim->d_parent->d_inode == dir->d_inode)) {
-			victim->d_inode->i_mtime = CURRENT_TIME;
-			fsnotify_modify(victim);
-			res = 0;
-		} else
-			d_drop(victim);
-		
-		/**
-		 * Drop the reference acquired from lookup_one_len() above.
-		 */
-		dput(victim);
+	struct sysfs_dirent *victim_sd = NULL;
+	struct dentry *victim = NULL;
+	int rc;
+
+	rc = -ENOENT;
+	victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
+	if (!victim_sd)
+		goto out;
+
+	victim = sysfs_get_dentry(victim_sd);
+	if (IS_ERR(victim)) {
+		rc = PTR_ERR(victim);
+		victim = NULL;
+		goto out;
 	}
-	mutex_unlock(&dir->d_inode->i_mutex);
 
-	return res;
+	mutex_lock(&victim->d_inode->i_mutex);
+	victim->d_inode->i_mtime = CURRENT_TIME;
+	fsnotify_modify(victim);
+	mutex_unlock(&victim->d_inode->i_mutex);
+	rc = 0;
+ out:
+	dput(victim);
+	sysfs_put(victim_sd);
+	return rc;
 }
 
 
@@ -521,30 +501,34 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
  */
 int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 {
-	struct dentry *dir = kobj->sd->s_dentry;
-	struct dentry *victim;
+	struct sysfs_dirent *victim_sd = NULL;
+	struct dentry *victim = NULL;
 	struct inode * inode;
 	struct iattr newattrs;
-	int res = -ENOENT;
-
-	mutex_lock(&dir->d_inode->i_mutex);
-	victim = lookup_one_len(attr->name, dir, strlen(attr->name));
-	if (!IS_ERR(victim)) {
-		if (victim->d_inode &&
-		    (victim->d_parent->d_inode == dir->d_inode)) {
-			inode = victim->d_inode;
-			mutex_lock(&inode->i_mutex);
-			newattrs.ia_mode = (mode & S_IALLUGO) |
-						(inode->i_mode & ~S_IALLUGO);
-			newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-			res = notify_change(victim, &newattrs);
-			mutex_unlock(&inode->i_mutex);
-		}
-		dput(victim);
+	int rc;
+
+	rc = -ENOENT;
+	victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
+	if (!victim_sd)
+		goto out;
+
+	victim = sysfs_get_dentry(victim_sd);
+	if (IS_ERR(victim)) {
+		rc = PTR_ERR(victim);
+		victim = NULL;
+		goto out;
 	}
-	mutex_unlock(&dir->d_inode->i_mutex);
 
-	return res;
+	inode = victim->d_inode;
+	mutex_lock(&inode->i_mutex);
+	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+	rc = notify_change(victim, &newattrs);
+	mutex_unlock(&inode->i_mutex);
+ out:
+	dput(victim);
+	sysfs_put(victim_sd);
+	return rc;
 }
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 078537e5d696..402cc356203c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -24,7 +24,7 @@ static const struct super_operations sysfs_ops = {
 	.drop_inode	= sysfs_delete_inode,
 };
 
-static struct sysfs_dirent sysfs_root = {
+struct sysfs_dirent sysfs_root = {
 	.s_count	= ATOMIC_INIT(1),
 	.s_flags	= SYSFS_ROOT,
 	.s_mode		= S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 72530dc666fd..6a37f2386a8d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -52,6 +52,7 @@ struct sysfs_addrm_cxt {
 };
 
 extern struct vfsmount * sysfs_mount;
+extern struct sysfs_dirent sysfs_root;
 extern struct kmem_cache *sysfs_dir_cachep;
 
 extern struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 4c43030fae5d..2f58ca1af770 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -81,7 +81,6 @@ struct sysfs_ops {
 #define SYSFS_KOBJ_ATTR 	0x0004
 #define SYSFS_KOBJ_BIN_ATTR	0x0008
 #define SYSFS_KOBJ_LINK 	0x0020
-#define SYSFS_NOT_PINNED	(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR | SYSFS_KOBJ_LINK)
 #define SYSFS_COPY_NAME		(SYSFS_DIR | SYSFS_KOBJ_LINK)
 
 #define SYSFS_FLAG_MASK		~SYSFS_TYPE_MASK
-- 
cgit v1.3-8-gc7d7


From 91a6902958f052358899f58683d44e36228d85c2 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Sat, 9 Jun 2007 13:57:22 +0800
Subject: sysfs: add parameter "struct bin_attribute *" in .read/.write methods
 for sysfs binary attributes

Well, first of all, I don't want to change so many files either.

What I do:
Adding a new parameter "struct bin_attribute *" in the
.read/.write methods for the sysfs binary attributes.

In fact, only the four lines change in fs/sysfs/bin.c and
include/linux/sysfs.h do the real work.
But I have to update all the files that use binary attributes
to make them compatible with the new .read and .write methods.
I'm not sure if I missed any. :(

Why I do this:
For a sysfs attribute, we can get a pointer pointing to the
struct attribute in the .show/.store method,
while we can't do this for the binary attributes.
I don't know why this is different, but this does make it not
so handy to use the binary attributes as the regular ones.
So I think this patch is reasonable. :)

Who benefits from it:
The patch that exposes ACPI tables in sysfs
requires such an improvement.
All the table binary attributes share the same .read method.
Parameter "struct bin_attribute *" is used to get
the table signature and instance number which are used to
distinguish different ACPI table binary attributes.

Without this parameter, we need to offer different .read methods
for different ACPI table binary attributes.
This is impossible as there are various ACPI tables on different
platforms, and we don't know what they are until they are loaded.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 .../firmware_sample_firmware_class.c               |  2 +
 drivers/base/firmware_class.c                      |  4 +-
 drivers/firmware/dcdbas.c                          | 10 +++--
 drivers/firmware/dell_rbu.c                        | 25 ++++++-----
 drivers/i2c/chips/eeprom.c                         |  3 +-
 drivers/i2c/chips/max6875.c                        |  5 ++-
 drivers/pci/hotplug/acpiphp_ibm.c                  |  6 ++-
 drivers/pci/pci-sysfs.c                            | 18 +++++---
 drivers/pcmcia/socket_sysfs.c                      |  8 +++-
 drivers/rapidio/rio-sysfs.c                        |  6 ++-
 drivers/rtc/rtc-ds1553.c                           | 10 +++--
 drivers/rtc/rtc-ds1742.c                           | 10 +++--
 drivers/s390/cio/chp.c                             | 10 +++--
 drivers/scsi/arcmsr/arcmsr_attr.c                  | 15 ++++---
 drivers/scsi/ipr.c                                 | 18 +++++---
 drivers/scsi/libsas/sas_expander.c                 | 16 ++++---
 drivers/scsi/lpfc/lpfc_attr.c                      | 12 ++++--
 drivers/scsi/qla2xxx/qla_attr.c                    | 50 +++++++++++++---------
 drivers/spi/at25.c                                 |  6 ++-
 drivers/video/aty/radeon_base.c                    |  8 +++-
 drivers/w1/slaves/w1_ds2433.c                      | 10 +++--
 drivers/w1/slaves/w1_therm.c                       |  7 ++-
 drivers/w1/w1.c                                    | 12 ++++--
 drivers/zorro/zorro-sysfs.c                        |  5 ++-
 fs/sysfs/bin.c                                     |  4 +-
 include/linux/sysfs.h                              |  6 ++-
 net/bridge/br_sysfs_br.c                           |  5 ++-
 27 files changed, 185 insertions(+), 106 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/firmware_class/firmware_sample_firmware_class.c b/Documentation/firmware_class/firmware_sample_firmware_class.c
index 4994f1f28f8c..fba943aacf93 100644
--- a/Documentation/firmware_class/firmware_sample_firmware_class.c
+++ b/Documentation/firmware_class/firmware_sample_firmware_class.c
@@ -78,6 +78,7 @@ static CLASS_DEVICE_ATTR(loading, 0644,
 			 firmware_loading_show, firmware_loading_store);
 
 static ssize_t firmware_data_read(struct kobject *kobj,
+				  struct bin_attribute *bin_attr,
 				  char *buffer, loff_t offset, size_t count)
 {
 	struct class_device *class_dev = to_class_dev(kobj);
@@ -88,6 +89,7 @@ static ssize_t firmware_data_read(struct kobject *kobj,
 	return count;
 }
 static ssize_t firmware_data_write(struct kobject *kobj,
+				   struct bin_attribute *bin_attr,
 				   char *buffer, loff_t offset, size_t count)
 {
 	struct class_device *class_dev = to_class_dev(kobj);
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 0e511485d2e6..53f0ee6f3016 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -175,7 +175,7 @@ static ssize_t firmware_loading_store(struct device *dev,
 static DEVICE_ATTR(loading, 0644, firmware_loading_show, firmware_loading_store);
 
 static ssize_t
-firmware_data_read(struct kobject *kobj,
+firmware_data_read(struct kobject *kobj, struct bin_attribute *bin_attr,
 		   char *buffer, loff_t offset, size_t count)
 {
 	struct device *dev = to_dev(kobj);
@@ -240,7 +240,7 @@ fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
  *	the driver as a firmware image.
  **/
 static ssize_t
-firmware_data_write(struct kobject *kobj,
+firmware_data_write(struct kobject *kobj, struct bin_attribute *bin_attr,
 		    char *buffer, loff_t offset, size_t count)
 {
 	struct device *dev = to_dev(kobj);
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 1865b56fb141..18cdcb3ae1ca 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -149,8 +149,9 @@ static ssize_t smi_data_buf_size_store(struct device *dev,
 	return count;
 }
 
-static ssize_t smi_data_read(struct kobject *kobj, char *buf, loff_t pos,
-			     size_t count)
+static ssize_t smi_data_read(struct kobject *kobj,
+			     struct bin_attribute *bin_attr,
+			     char *buf, loff_t pos, size_t count)
 {
 	size_t max_read;
 	ssize_t ret;
@@ -170,8 +171,9 @@ out:
 	return ret;
 }
 
-static ssize_t smi_data_write(struct kobject *kobj, char *buf, loff_t pos,
-			      size_t count)
+static ssize_t smi_data_write(struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buf, loff_t pos, size_t count)
 {
 	ssize_t ret;
 
diff --git a/drivers/firmware/dell_rbu.c b/drivers/firmware/dell_rbu.c
index f8afecb7d0cf..477a3d0e3caf 100644
--- a/drivers/firmware/dell_rbu.c
+++ b/drivers/firmware/dell_rbu.c
@@ -543,8 +543,9 @@ static ssize_t read_rbu_mono_data(char *buffer, loff_t pos, size_t count)
 	return ret_count;
 }
 
-static ssize_t read_rbu_data(struct kobject *kobj, char *buffer,
-	loff_t pos, size_t count)
+static ssize_t read_rbu_data(struct kobject *kobj,
+			     struct bin_attribute *bin_attr,
+			     char *buffer, loff_t pos, size_t count)
 {
 	ssize_t ret_count = 0;
 
@@ -591,8 +592,9 @@ static void callbackfn_rbu(const struct firmware *fw, void *context)
 	spin_unlock(&rbu_data.lock);
 }
 
-static ssize_t read_rbu_image_type(struct kobject *kobj, char *buffer,
-	loff_t pos, size_t count)
+static ssize_t read_rbu_image_type(struct kobject *kobj,
+				   struct bin_attribute *bin_attr,
+				   char *buffer, loff_t pos, size_t count)
 {
 	int size = 0;
 	if (!pos)
@@ -600,8 +602,9 @@ static ssize_t read_rbu_image_type(struct kobject *kobj, char *buffer,
 	return size;
 }
 
-static ssize_t write_rbu_image_type(struct kobject *kobj, char *buffer,
-	loff_t pos, size_t count)
+static ssize_t write_rbu_image_type(struct kobject *kobj,
+				    struct bin_attribute *bin_attr,
+				    char *buffer, loff_t pos, size_t count)
 {
 	int rc = count;
 	int req_firm_rc = 0;
@@ -660,8 +663,9 @@ static ssize_t write_rbu_image_type(struct kobject *kobj, char *buffer,
 	return rc;
 }
 
-static ssize_t read_rbu_packet_size(struct kobject *kobj, char *buffer,
-	loff_t pos, size_t count)
+static ssize_t read_rbu_packet_size(struct kobject *kobj,
+				    struct bin_attribute *bin_attr,
+				    char *buffer, loff_t pos, size_t count)
 {
 	int size = 0;
 	if (!pos) {
@@ -672,8 +676,9 @@ static ssize_t read_rbu_packet_size(struct kobject *kobj, char *buffer,
 	return size;
 }
 
-static ssize_t write_rbu_packet_size(struct kobject *kobj, char *buffer,
-	loff_t pos, size_t count)
+static ssize_t write_rbu_packet_size(struct kobject *kobj,
+				     struct bin_attribute *bin_attr,
+				     char *buffer, loff_t pos, size_t count)
 {
 	unsigned long temp;
 	spin_lock(&rbu_data.lock);
diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c
index 5990dd5fc773..332816431105 100644
--- a/drivers/i2c/chips/eeprom.c
+++ b/drivers/i2c/chips/eeprom.c
@@ -110,7 +110,8 @@ exit:
 	mutex_unlock(&data->update_lock);
 }
 
-static ssize_t eeprom_read(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t eeprom_read(struct kobject *kobj, struct bin_attribute *bin_attr,
+			   char *buf, loff_t off, size_t count)
 {
 	struct i2c_client *client = to_i2c_client(container_of(kobj, struct device, kobj));
 	struct eeprom_data *data = i2c_get_clientdata(client);
diff --git a/drivers/i2c/chips/max6875.c b/drivers/i2c/chips/max6875.c
index 1405ce5b236c..4e238c0a7ca3 100644
--- a/drivers/i2c/chips/max6875.c
+++ b/drivers/i2c/chips/max6875.c
@@ -125,8 +125,9 @@ exit_up:
 	mutex_unlock(&data->update_lock);
 }
 
-static ssize_t max6875_read(struct kobject *kobj, char *buf, loff_t off,
-			    size_t count)
+static ssize_t max6875_read(struct kobject *kobj,
+			    struct bin_attribute *bin_attr,
+			    char *buf, loff_t off, size_t count)
 {
 	struct i2c_client *client = kobj_to_i2c_client(kobj);
 	struct max6875_data *data = i2c_get_clientdata(client);
diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c
index 74556ec31a5b..70db38c0ced9 100644
--- a/drivers/pci/hotplug/acpiphp_ibm.c
+++ b/drivers/pci/hotplug/acpiphp_ibm.c
@@ -106,7 +106,8 @@ static int ibm_get_attention_status(struct hotplug_slot *slot, u8 *status);
 static void ibm_handle_events(acpi_handle handle, u32 event, void *context);
 static int ibm_get_table_from_acpi(char **bufp);
 static ssize_t ibm_read_apci_table(struct kobject *kobj,
-		char *buffer, loff_t pos, size_t size);
+				   struct bin_attribute *bin_attr,
+				   char *buffer, loff_t pos, size_t size);
 static acpi_status __init ibm_find_acpi_device(acpi_handle handle,
 		u32 lvl, void *context, void **rv);
 static int __init ibm_acpiphp_init(void);
@@ -357,7 +358,8 @@ read_table_done:
  * our solution is to only allow reading the table in all at once
  **/
 static ssize_t ibm_read_apci_table(struct kobject *kobj,
-		char *buffer, loff_t pos, size_t size)
+				   struct bin_attribute *bin_attr,
+				   char *buffer, loff_t pos, size_t size)
 {
 	int bytes_read = -EINVAL;
 	char *table = NULL;
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index d448f8df8613..6543cbe83be5 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -213,7 +213,8 @@ struct device_attribute pci_dev_attrs[] = {
 };
 
 static ssize_t
-pci_read_config(struct kobject *kobj, char *buf, loff_t off, size_t count)
+pci_read_config(struct kobject *kobj, struct bin_attribute *bin_attr,
+		char *buf, loff_t off, size_t count)
 {
 	struct pci_dev *dev = to_pci_dev(container_of(kobj,struct device,kobj));
 	unsigned int size = 64;
@@ -285,7 +286,8 @@ pci_read_config(struct kobject *kobj, char *buf, loff_t off, size_t count)
 }
 
 static ssize_t
-pci_write_config(struct kobject *kobj, char *buf, loff_t off, size_t count)
+pci_write_config(struct kobject *kobj, struct bin_attribute *bin_attr,
+		 char *buf, loff_t off, size_t count)
 {
 	struct pci_dev *dev = to_pci_dev(container_of(kobj,struct device,kobj));
 	unsigned int size = count;
@@ -352,7 +354,8 @@ pci_write_config(struct kobject *kobj, char *buf, loff_t off, size_t count)
  * callback routine (pci_legacy_read).
  */
 ssize_t
-pci_read_legacy_io(struct kobject *kobj, char *buf, loff_t off, size_t count)
+pci_read_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr,
+		   char *buf, loff_t off, size_t count)
 {
         struct pci_bus *bus = to_pci_bus(container_of(kobj,
                                                       struct class_device,
@@ -376,7 +379,8 @@ pci_read_legacy_io(struct kobject *kobj, char *buf, loff_t off, size_t count)
  * callback routine (pci_legacy_write).
  */
 ssize_t
-pci_write_legacy_io(struct kobject *kobj, char *buf, loff_t off, size_t count)
+pci_write_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr,
+		    char *buf, loff_t off, size_t count)
 {
         struct pci_bus *bus = to_pci_bus(container_of(kobj,
 						      struct class_device,
@@ -528,7 +532,8 @@ static inline void pci_remove_resource_files(struct pci_dev *dev) { return; }
  * writing anything except 0 enables it
  */
 static ssize_t
-pci_write_rom(struct kobject *kobj, char *buf, loff_t off, size_t count)
+pci_write_rom(struct kobject *kobj, struct bin_attribute *bin_attr,
+	      char *buf, loff_t off, size_t count)
 {
 	struct pci_dev *pdev = to_pci_dev(container_of(kobj, struct device, kobj));
 
@@ -551,7 +556,8 @@ pci_write_rom(struct kobject *kobj, char *buf, loff_t off, size_t count)
  * device corresponding to @kobj.
  */
 static ssize_t
-pci_read_rom(struct kobject *kobj, char *buf, loff_t off, size_t count)
+pci_read_rom(struct kobject *kobj, struct bin_attribute *bin_attr,
+	     char *buf, loff_t off, size_t count)
 {
 	struct pci_dev *pdev = to_pci_dev(container_of(kobj, struct device, kobj));
 	void __iomem *rom;
diff --git a/drivers/pcmcia/socket_sysfs.c b/drivers/pcmcia/socket_sysfs.c
index dbfbe65779e5..b4409002b7f8 100644
--- a/drivers/pcmcia/socket_sysfs.c
+++ b/drivers/pcmcia/socket_sysfs.c
@@ -283,7 +283,9 @@ static ssize_t pccard_extract_cis(struct pcmcia_socket *s, char *buf, loff_t off
 	return (ret);
 }
 
-static ssize_t pccard_show_cis(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t pccard_show_cis(struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t off, size_t count)
 {
 	unsigned int size = 0x200;
 
@@ -311,7 +313,9 @@ static ssize_t pccard_show_cis(struct kobject *kobj, char *buf, loff_t off, size
 	return (count);
 }
 
-static ssize_t pccard_store_cis(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t pccard_store_cis(struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buf, loff_t off, size_t count)
 {
 	struct pcmcia_socket *s = to_socket(container_of(kobj, struct device, kobj));
 	cisdump_t *cis;
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index a3972b9f96e6..659e31164cf0 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -67,7 +67,8 @@ struct device_attribute rio_dev_attrs[] = {
 };
 
 static ssize_t
-rio_read_config(struct kobject *kobj, char *buf, loff_t off, size_t count)
+rio_read_config(struct kobject *kobj, struct bin_attribute *bin_attr,
+		char *buf, loff_t off, size_t count)
 {
 	struct rio_dev *dev =
 	    to_rio_dev(container_of(kobj, struct device, kobj));
@@ -137,7 +138,8 @@ rio_read_config(struct kobject *kobj, char *buf, loff_t off, size_t count)
 }
 
 static ssize_t
-rio_write_config(struct kobject *kobj, char *buf, loff_t off, size_t count)
+rio_write_config(struct kobject *kobj, struct bin_attribute *bin_attr,
+		 char *buf, loff_t off, size_t count)
 {
 	struct rio_dev *dev =
 	    to_rio_dev(container_of(kobj, struct device, kobj));
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index b024cfb558f4..f98a83a11aae 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -258,8 +258,9 @@ static const struct rtc_class_ops ds1553_rtc_ops = {
 	.ioctl		= ds1553_rtc_ioctl,
 };
 
-static ssize_t ds1553_nvram_read(struct kobject *kobj, char *buf,
-				 loff_t pos, size_t size)
+static ssize_t ds1553_nvram_read(struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t pos, size_t size)
 {
 	struct platform_device *pdev =
 		to_platform_device(container_of(kobj, struct device, kobj));
@@ -272,8 +273,9 @@ static ssize_t ds1553_nvram_read(struct kobject *kobj, char *buf,
 	return count;
 }
 
-static ssize_t ds1553_nvram_write(struct kobject *kobj, char *buf,
-				  loff_t pos, size_t size)
+static ssize_t ds1553_nvram_write(struct kobject *kobj,
+				  struct bin_attribute *bin_attr,
+				  char *buf, loff_t pos, size_t size)
 {
 	struct platform_device *pdev =
 		to_platform_device(container_of(kobj, struct device, kobj));
diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c
index 1638acdbc913..d1778ae8bca5 100644
--- a/drivers/rtc/rtc-ds1742.c
+++ b/drivers/rtc/rtc-ds1742.c
@@ -127,8 +127,9 @@ static const struct rtc_class_ops ds1742_rtc_ops = {
 	.set_time	= ds1742_rtc_set_time,
 };
 
-static ssize_t ds1742_nvram_read(struct kobject *kobj, char *buf,
-				 loff_t pos, size_t size)
+static ssize_t ds1742_nvram_read(struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t pos, size_t size)
 {
 	struct platform_device *pdev =
 		to_platform_device(container_of(kobj, struct device, kobj));
@@ -141,8 +142,9 @@ static ssize_t ds1742_nvram_read(struct kobject *kobj, char *buf,
 	return count;
 }
 
-static ssize_t ds1742_nvram_write(struct kobject *kobj, char *buf,
-				  loff_t pos, size_t size)
+static ssize_t ds1742_nvram_write(struct kobject *kobj,
+				  struct bin_attribute *bin_attr,
+				  char *buf, loff_t pos, size_t size)
 {
 	struct platform_device *pdev =
 		to_platform_device(container_of(kobj, struct device, kobj));
diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index 96a8a72a6083..b57d93d986c0 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -141,8 +141,9 @@ static int s390_vary_chpid(struct chp_id chpid, int on)
 /*
  * Channel measurement related functions
  */
-static ssize_t chp_measurement_chars_read(struct kobject *kobj, char *buf,
-					  loff_t off, size_t count)
+static ssize_t chp_measurement_chars_read(struct kobject *kobj,
+					  struct bin_attribute *bin_attr,
+					  char *buf, loff_t off, size_t count)
 {
 	struct channel_path *chp;
 	unsigned int size;
@@ -192,8 +193,9 @@ static void chp_measurement_copy_block(struct cmg_entry *buf,
 	} while (reference_buf.values[0] != buf->values[0]);
 }
 
-static ssize_t chp_measurement_read(struct kobject *kobj, char *buf,
-				    loff_t off, size_t count)
+static ssize_t chp_measurement_read(struct kobject *kobj,
+				    struct bin_attribute *bin_attr,
+				    char *buf, loff_t off, size_t count)
 {
 	struct channel_path *chp;
 	struct channel_subsystem *css;
diff --git a/drivers/scsi/arcmsr/arcmsr_attr.c b/drivers/scsi/arcmsr/arcmsr_attr.c
index 8908228bc134..06c0dce3b839 100644
--- a/drivers/scsi/arcmsr/arcmsr_attr.c
+++ b/drivers/scsi/arcmsr/arcmsr_attr.c
@@ -59,8 +59,9 @@
 struct class_device_attribute *arcmsr_host_attrs[];
 
 static ssize_t
-arcmsr_sysfs_iop_message_read(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+arcmsr_sysfs_iop_message_read(struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buf, loff_t off, size_t count)
 {
 	struct class_device *cdev = container_of(kobj,struct class_device,kobj);
 	struct Scsi_Host *host = class_to_shost(cdev);
@@ -105,8 +106,9 @@ arcmsr_sysfs_iop_message_read(struct kobject *kobj, char *buf, loff_t off,
 }
 
 static ssize_t
-arcmsr_sysfs_iop_message_write(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+arcmsr_sysfs_iop_message_write(struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t off, size_t count)
 {
 	struct class_device *cdev = container_of(kobj,struct class_device,kobj);
 	struct Scsi_Host *host = class_to_shost(cdev);
@@ -152,8 +154,9 @@ arcmsr_sysfs_iop_message_write(struct kobject *kobj, char *buf, loff_t off,
 }
 
 static ssize_t
-arcmsr_sysfs_iop_message_clear(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+arcmsr_sysfs_iop_message_clear(struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t off, size_t count)
 {
 	struct class_device *cdev = container_of(kobj,struct class_device,kobj);
 	struct Scsi_Host *host = class_to_shost(cdev);
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index fa6ff295e568..4a3083ea59d5 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -2465,6 +2465,7 @@ restart:
 /**
  * ipr_read_trace - Dump the adapter trace
  * @kobj:		kobject struct
+ * @bin_attr:		bin_attribute struct
  * @buf:		buffer
  * @off:		offset
  * @count:		buffer size
@@ -2472,8 +2473,9 @@ restart:
  * Return value:
  *	number of bytes printed to buffer
  **/
-static ssize_t ipr_read_trace(struct kobject *kobj, char *buf,
-			      loff_t off, size_t count)
+static ssize_t ipr_read_trace(struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buf, loff_t off, size_t count)
 {
 	struct class_device *cdev = container_of(kobj,struct class_device,kobj);
 	struct Scsi_Host *shost = class_to_shost(cdev);
@@ -3166,6 +3168,7 @@ static struct class_device_attribute *ipr_ioa_attrs[] = {
 /**
  * ipr_read_dump - Dump the adapter
  * @kobj:		kobject struct
+ * @bin_attr:		bin_attribute struct
  * @buf:		buffer
  * @off:		offset
  * @count:		buffer size
@@ -3173,8 +3176,9 @@ static struct class_device_attribute *ipr_ioa_attrs[] = {
  * Return value:
  *	number of bytes printed to buffer
  **/
-static ssize_t ipr_read_dump(struct kobject *kobj, char *buf,
-			      loff_t off, size_t count)
+static ssize_t ipr_read_dump(struct kobject *kobj,
+			     struct bin_attribute *bin_attr,
+			     char *buf, loff_t off, size_t count)
 {
 	struct class_device *cdev = container_of(kobj,struct class_device,kobj);
 	struct Scsi_Host *shost = class_to_shost(cdev);
@@ -3327,6 +3331,7 @@ static int ipr_free_dump(struct ipr_ioa_cfg *ioa_cfg)
 /**
  * ipr_write_dump - Setup dump state of adapter
  * @kobj:		kobject struct
+ * @bin_attr:		bin_attribute struct
  * @buf:		buffer
  * @off:		offset
  * @count:		buffer size
@@ -3334,8 +3339,9 @@ static int ipr_free_dump(struct ipr_ioa_cfg *ioa_cfg)
  * Return value:
  *	number of bytes printed to buffer
  **/
-static ssize_t ipr_write_dump(struct kobject *kobj, char *buf,
-			      loff_t off, size_t count)
+static ssize_t ipr_write_dump(struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buf, loff_t off, size_t count)
 {
 	struct class_device *cdev = container_of(kobj,struct class_device,kobj);
 	struct Scsi_Host *shost = class_to_shost(cdev);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 578ed79f4148..23e90c5f8f35 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -38,8 +38,10 @@ static int sas_disable_routing(struct domain_device *dev,  u8 *sas_addr);
 
 #if 0
 /* FIXME: smp needs to migrate into the sas class */
-static ssize_t smp_portal_read(struct kobject *, char *, loff_t, size_t);
-static ssize_t smp_portal_write(struct kobject *, char *, loff_t, size_t);
+static ssize_t smp_portal_read(struct kobject *, struct bin_attribute *,
+			       char *, loff_t, size_t);
+static ssize_t smp_portal_write(struct kobject *, struct bin_attribute *,
+				char *, loff_t, size_t);
 #endif
 
 /* ---------- SMP task management ---------- */
@@ -1845,8 +1847,9 @@ out:
 #if 0
 /* ---------- SMP portal ---------- */
 
-static ssize_t smp_portal_write(struct kobject *kobj, char *buf, loff_t offs,
-				size_t size)
+static ssize_t smp_portal_write(struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buf, loff_t offs, size_t size)
 {
 	struct domain_device *dev = to_dom_device(kobj);
 	struct expander_device *ex = &dev->ex_dev;
@@ -1872,8 +1875,9 @@ static ssize_t smp_portal_write(struct kobject *kobj, char *buf, loff_t offs,
 	return size;
 }
 
-static ssize_t smp_portal_read(struct kobject *kobj, char *buf, loff_t offs,
-			       size_t size)
+static ssize_t smp_portal_read(struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t offs, size_t size)
 {
 	struct domain_device *dev = to_dom_device(kobj);
 	struct expander_device *ex = &dev->ex_dev;
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index f81fe501a4a1..5dfda9778c80 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -1133,7 +1133,8 @@ struct class_device_attribute *lpfc_host_attrs[] = {
 };
 
 static ssize_t
-sysfs_ctlreg_write(struct kobject *kobj, char *buf, loff_t off, size_t count)
+sysfs_ctlreg_write(struct kobject *kobj, struct bin_attribute *bin_attr,
+		   char *buf, loff_t off, size_t count)
 {
 	size_t buf_off;
 	struct Scsi_Host *host = class_to_shost(container_of(kobj,
@@ -1165,7 +1166,8 @@ sysfs_ctlreg_write(struct kobject *kobj, char *buf, loff_t off, size_t count)
 }
 
 static ssize_t
-sysfs_ctlreg_read(struct kobject *kobj, char *buf, loff_t off, size_t count)
+sysfs_ctlreg_read(struct kobject *kobj, struct bin_attribute *bin_attr,
+		  char *buf, loff_t off, size_t count)
 {
 	size_t buf_off;
 	uint32_t * tmp_ptr;
@@ -1221,7 +1223,8 @@ sysfs_mbox_idle (struct lpfc_hba * phba)
 }
 
 static ssize_t
-sysfs_mbox_write(struct kobject *kobj, char *buf, loff_t off, size_t count)
+sysfs_mbox_write(struct kobject *kobj, struct bin_attribute *bin_attr,
+		 char *buf, loff_t off, size_t count)
 {
 	struct Scsi_Host * host =
 		class_to_shost(container_of(kobj, struct class_device, kobj));
@@ -1273,7 +1276,8 @@ sysfs_mbox_write(struct kobject *kobj, char *buf, loff_t off, size_t count)
 }
 
 static ssize_t
-sysfs_mbox_read(struct kobject *kobj, char *buf, loff_t off, size_t count)
+sysfs_mbox_read(struct kobject *kobj, struct bin_attribute *bin_attr,
+		char *buf, loff_t off, size_t count)
 {
 	struct Scsi_Host *host =
 		class_to_shost(container_of(kobj, struct class_device,
diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c
index 96587253bfa9..942db9de785e 100644
--- a/drivers/scsi/qla2xxx/qla_attr.c
+++ b/drivers/scsi/qla2xxx/qla_attr.c
@@ -11,8 +11,9 @@
 /* SYSFS attributes --------------------------------------------------------- */
 
 static ssize_t
-qla2x00_sysfs_read_fw_dump(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_read_fw_dump(struct kobject *kobj,
+			   struct bin_attribute *bin_attr,
+			   char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -31,8 +32,9 @@ qla2x00_sysfs_read_fw_dump(struct kobject *kobj, char *buf, loff_t off,
 }
 
 static ssize_t
-qla2x00_sysfs_write_fw_dump(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_write_fw_dump(struct kobject *kobj,
+			    struct bin_attribute *bin_attr,
+			    char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -80,8 +82,9 @@ static struct bin_attribute sysfs_fw_dump_attr = {
 };
 
 static ssize_t
-qla2x00_sysfs_read_nvram(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_read_nvram(struct kobject *kobj,
+			 struct bin_attribute *bin_attr,
+			 char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -100,8 +103,9 @@ qla2x00_sysfs_read_nvram(struct kobject *kobj, char *buf, loff_t off,
 }
 
 static ssize_t
-qla2x00_sysfs_write_nvram(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_write_nvram(struct kobject *kobj,
+			  struct bin_attribute *bin_attr,
+			  char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -155,8 +159,9 @@ static struct bin_attribute sysfs_nvram_attr = {
 };
 
 static ssize_t
-qla2x00_sysfs_read_optrom(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_read_optrom(struct kobject *kobj,
+			  struct bin_attribute *bin_attr,
+			  char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -174,8 +179,9 @@ qla2x00_sysfs_read_optrom(struct kobject *kobj, char *buf, loff_t off,
 }
 
 static ssize_t
-qla2x00_sysfs_write_optrom(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_write_optrom(struct kobject *kobj,
+			   struct bin_attribute *bin_attr,
+			   char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -203,8 +209,9 @@ static struct bin_attribute sysfs_optrom_attr = {
 };
 
 static ssize_t
-qla2x00_sysfs_write_optrom_ctl(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_write_optrom_ctl(struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -282,8 +289,9 @@ static struct bin_attribute sysfs_optrom_ctl_attr = {
 };
 
 static ssize_t
-qla2x00_sysfs_read_vpd(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_read_vpd(struct kobject *kobj,
+		       struct bin_attribute *bin_attr,
+		       char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -301,8 +309,9 @@ qla2x00_sysfs_read_vpd(struct kobject *kobj, char *buf, loff_t off,
 }
 
 static ssize_t
-qla2x00_sysfs_write_vpd(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_write_vpd(struct kobject *kobj,
+			struct bin_attribute *bin_attr,
+			char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
@@ -330,8 +339,9 @@ static struct bin_attribute sysfs_vpd_attr = {
 };
 
 static ssize_t
-qla2x00_sysfs_read_sfp(struct kobject *kobj, char *buf, loff_t off,
-    size_t count)
+qla2x00_sysfs_read_sfp(struct kobject *kobj,
+		       struct bin_attribute *bin_attr,
+		       char *buf, loff_t off, size_t count)
 {
 	struct scsi_qla_host *ha = to_qla_host(dev_to_shost(container_of(kobj,
 	    struct device, kobj)));
diff --git a/drivers/spi/at25.c b/drivers/spi/at25.c
index fde1dededba3..e007833cca59 100644
--- a/drivers/spi/at25.c
+++ b/drivers/spi/at25.c
@@ -111,7 +111,8 @@ at25_ee_read(
 }
 
 static ssize_t
-at25_bin_read(struct kobject *kobj, char *buf, loff_t off, size_t count)
+at25_bin_read(struct kobject *kobj, struct bin_attribute *bin_attr,
+	      char *buf, loff_t off, size_t count)
 {
 	struct device		*dev;
 	struct at25_data	*at25;
@@ -236,7 +237,8 @@ at25_ee_write(struct at25_data *at25, char *buf, loff_t off, size_t count)
 }
 
 static ssize_t
-at25_bin_write(struct kobject *kobj, char *buf, loff_t off, size_t count)
+at25_bin_write(struct kobject *kobj, struct bin_attribute *bin_attr,
+	       char *buf, loff_t off, size_t count)
 {
 	struct device		*dev;
 	struct at25_data	*at25;
diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c
index 3b3c6571f583..2349e71b0083 100644
--- a/drivers/video/aty/radeon_base.c
+++ b/drivers/video/aty/radeon_base.c
@@ -2102,7 +2102,9 @@ static ssize_t radeon_show_one_edid(char *buf, loff_t off, size_t count, const u
 }
 
 
-static ssize_t radeon_show_edid1(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t radeon_show_edid1(struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t off, size_t count)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct pci_dev *pdev = to_pci_dev(dev);
@@ -2113,7 +2115,9 @@ static ssize_t radeon_show_edid1(struct kobject *kobj, char *buf, loff_t off, si
 }
 
 
-static ssize_t radeon_show_edid2(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t radeon_show_edid2(struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t off, size_t count)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct pci_dev *pdev = to_pci_dev(dev);
diff --git a/drivers/w1/slaves/w1_ds2433.c b/drivers/w1/slaves/w1_ds2433.c
index 4e13aa71adea..cab56005dd49 100644
--- a/drivers/w1/slaves/w1_ds2433.c
+++ b/drivers/w1/slaves/w1_ds2433.c
@@ -91,8 +91,9 @@ static int w1_f23_refresh_block(struct w1_slave *sl, struct w1_f23_data *data,
 }
 #endif	/* CONFIG_W1_SLAVE_DS2433_CRC */
 
-static ssize_t w1_f23_read_bin(struct kobject *kobj, char *buf, loff_t off,
-			       size_t count)
+static ssize_t w1_f23_read_bin(struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t off, size_t count)
 {
 	struct w1_slave *sl = kobj_to_w1_slave(kobj);
 #ifdef CONFIG_W1_SLAVE_DS2433_CRC
@@ -199,8 +200,9 @@ static int w1_f23_write(struct w1_slave *sl, int addr, int len, const u8 *data)
 	return 0;
 }
 
-static ssize_t w1_f23_write_bin(struct kobject *kobj, char *buf, loff_t off,
-				size_t count)
+static ssize_t w1_f23_write_bin(struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buf, loff_t off, size_t count)
 {
 	struct w1_slave *sl = kobj_to_w1_slave(kobj);
 	int addr, len, idx;
diff --git a/drivers/w1/slaves/w1_therm.c b/drivers/w1/slaves/w1_therm.c
index 8ba4e572e09c..4318935678c5 100644
--- a/drivers/w1/slaves/w1_therm.c
+++ b/drivers/w1/slaves/w1_therm.c
@@ -42,7 +42,8 @@ static u8 bad_roms[][9] = {
 				{}
 			};
 
-static ssize_t w1_therm_read_bin(struct kobject *, char *, loff_t, size_t);
+static ssize_t w1_therm_read_bin(struct kobject *, struct bin_attribute *,
+				 char *, loff_t, size_t);
 
 static struct bin_attribute w1_therm_bin_attr = {
 	.attr = {
@@ -158,7 +159,9 @@ static int w1_therm_check_rom(u8 rom[9])
 	return 0;
 }
 
-static ssize_t w1_therm_read_bin(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t w1_therm_read_bin(struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t off, size_t count)
 {
 	struct w1_slave *sl = kobj_to_w1_slave(kobj);
 	struct w1_master *dev = sl->master;
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 1838cb29b646..f5c5b760ed7b 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -105,7 +105,9 @@ static ssize_t w1_slave_read_name(struct device *dev, struct device_attribute *a
 	return sprintf(buf, "%s\n", sl->name);
 }
 
-static ssize_t w1_slave_read_id(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t w1_slave_read_id(struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buf, loff_t off, size_t count)
 {
 	struct w1_slave *sl = kobj_to_w1_slave(kobj);
 
@@ -135,7 +137,9 @@ static struct bin_attribute w1_slave_attr_bin_id = {
 
 /* Default family */
 
-static ssize_t w1_default_write(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t w1_default_write(struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buf, loff_t off, size_t count)
 {
 	struct w1_slave *sl = kobj_to_w1_slave(kobj);
 
@@ -152,7 +156,9 @@ out_up:
 	return count;
 }
 
-static ssize_t w1_default_read(struct kobject *kobj, char *buf, loff_t off, size_t count)
+static ssize_t w1_default_read(struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t off, size_t count)
 {
 	struct w1_slave *sl = kobj_to_w1_slave(kobj);
 
diff --git a/drivers/zorro/zorro-sysfs.c b/drivers/zorro/zorro-sysfs.c
index 7e03cc68b182..9130f1c12c26 100644
--- a/drivers/zorro/zorro-sysfs.c
+++ b/drivers/zorro/zorro-sysfs.c
@@ -49,8 +49,9 @@ static ssize_t zorro_show_resource(struct device *dev, struct device_attribute *
 
 static DEVICE_ATTR(resource, S_IRUGO, zorro_show_resource, NULL);
 
-static ssize_t zorro_read_config(struct kobject *kobj, char *buf, loff_t off,
-				 size_t count)
+static ssize_t zorro_read_config(struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t off, size_t count)
 {
 	struct zorro_dev *z = to_zorro_dev(container_of(kobj, struct device,
 					   kobj));
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 55796bdacd3d..135353f8a296 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -40,7 +40,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 
 	rc = -EIO;
 	if (attr->read)
-		rc = attr->read(kobj, buffer, off, count);
+		rc = attr->read(kobj, attr, buffer, off, count);
 
 	sysfs_put_active_two(attr_sd);
 
@@ -97,7 +97,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 
 	rc = -EIO;
 	if (attr->write)
-		rc = attr->write(kobj, buffer, offset, count);
+		rc = attr->write(kobj, attr, buffer, offset, count);
 
 	sysfs_put_active_two(attr_sd);
 
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 2f58ca1af770..be8228e50a27 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -64,8 +64,10 @@ struct bin_attribute {
 	struct attribute	attr;
 	size_t			size;
 	void			*private;
-	ssize_t (*read)(struct kobject *, char *, loff_t, size_t);
-	ssize_t (*write)(struct kobject *, char *, loff_t, size_t);
+	ssize_t (*read)(struct kobject *, struct bin_attribute *,
+			char *, loff_t, size_t);
+	ssize_t (*write)(struct kobject *, struct bin_attribute *,
+			 char *, loff_t, size_t);
 	int (*mmap)(struct kobject *, struct bin_attribute *attr,
 		    struct vm_area_struct *vma);
 };
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 31ace23a0914..4f42263e0a8a 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -360,8 +360,9 @@ static struct attribute_group bridge_group = {
  *
  * Returns the number of bytes read.
  */
-static ssize_t brforward_read(struct kobject *kobj, char *buf,
-			   loff_t off, size_t count)
+static ssize_t brforward_read(struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buf, loff_t off, size_t count)
 {
 	struct device *dev = to_dev(kobj);
 	struct net_bridge *br = to_bridge(dev);
-- 
cgit v1.3-8-gc7d7


From 29578624e354f56143d92510fff33a8b2aaa2c03 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <olaf.kirch@oracle.com>
Date: Wed, 11 Jul 2007 19:32:02 -0700
Subject: [NET]: Fix races in net_rx_action vs netpoll.

Keep netpoll/poll_napi from messing with the poll_list.
Only net_rx_action is allowed to manipulate the list.

Signed-off-by: Olaf Kirch <olaf.kirch@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 10 ++++++++++
 net/core/netpoll.c        |  8 ++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8590d685d935..79cc3dab4be7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -261,6 +261,8 @@ enum netdev_state_t
 	__LINK_STATE_LINKWATCH_PENDING,
 	__LINK_STATE_DORMANT,
 	__LINK_STATE_QDISC_RUNNING,
+	/* Set by the netpoll NAPI code */
+	__LINK_STATE_POLL_LIST_FROZEN,
 };
 
 
@@ -1014,6 +1016,14 @@ static inline void netif_rx_complete(struct net_device *dev)
 {
 	unsigned long flags;
 
+#ifdef CONFIG_NETPOLL
+	/* Prevent race with netpoll - yes, this is a kludge.
+	 * But at least it doesn't penalize the non-netpoll
+	 * code path. */
+	if (test_bit(__LINK_STATE_POLL_LIST_FROZEN, &dev->state))
+		return;
+#endif
+
 	local_irq_save(flags);
 	__netif_rx_complete(dev);
 	local_irq_restore(flags);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index de1b26aa5720..d1264e9a50a8 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -124,6 +124,13 @@ static void poll_napi(struct netpoll *np)
 	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
 	    npinfo->poll_owner != smp_processor_id() &&
 	    spin_trylock(&npinfo->poll_lock)) {
+		/* When calling dev->poll from poll_napi, we may end up in
+		 * netif_rx_complete. However, only the CPU to which the
+		 * device was queued is allowed to remove it from poll_list.
+		 * Setting POLL_LIST_FROZEN tells netif_rx_complete
+		 * to leave the NAPI state alone.
+		 */
+		set_bit(__LINK_STATE_POLL_LIST_FROZEN, &np->dev->state);
 		npinfo->rx_flags |= NETPOLL_RX_DROP;
 		atomic_inc(&trapped);
 
@@ -131,6 +138,7 @@ static void poll_napi(struct netpoll *np)
 
 		atomic_dec(&trapped);
 		npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+		clear_bit(__LINK_STATE_POLL_LIST_FROZEN, &np->dev->state);
 		spin_unlock(&npinfo->poll_lock);
 	}
 }
-- 
cgit v1.3-8-gc7d7


From 8c979c26a0f093c13290320edda799d8335e50ae Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 11 Jul 2007 19:45:24 -0700
Subject: [VLAN]: Fix MAC address handling

The VLAN MAC address handling is broken in multiple ways. When the address
differs when setting it, the real device is put in promiscous mode twice,
but never taken out again. Additionally it doesn't resync when the real
device's address is changed and needlessly puts it in promiscous mode when
the vlan device is still down.

Fix by moving address handling to vlan_dev_open/vlan_dev_stop and properly
deal with address changes in the device notifier. Also switch to
dev_unicast_add (which needs the exact same handling).

Since the set_mac_address handler is identical to the generic ethernet one
with these changes, kill it and use ether_setup().

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  1 +
 net/8021q/vlan.c        | 43 ++++++++++++++++++++++++++++++++-----
 net/8021q/vlan.h        |  1 -
 net/8021q/vlan_dev.c    | 57 ++++++++++++++++---------------------------------
 4 files changed, 57 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index c7912876a210..61a57dc2ac99 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -135,6 +135,7 @@ struct vlan_dev_info {
 	int old_allmulti;               /* similar to above. */
 	int old_promiscuity;            /* similar to above. */
 	struct net_device *real_dev;    /* the underlying device/interface */
+	unsigned char real_dev_addr[ETH_ALEN];
 	struct proc_dir_entry *dent;    /* Holds the proc data */
 	unsigned long cnt_inc_headroom_on_tx; /* How many times did we have to grow the skb on TX. */
 	unsigned long cnt_encap_on_xmit;      /* How many times did we have to encapsulate the skb on TX. */
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index e7583eea6fda..b463ba47024d 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -345,12 +345,8 @@ static int vlan_dev_init(struct net_device *dev)
 					  (1<<__LINK_STATE_DORMANT))) |
 		      (1<<__LINK_STATE_PRESENT);
 
-	/* TODO: maybe just assign it to be ETHERNET? */
-	dev->type = real_dev->type;
-
 	memcpy(dev->broadcast, real_dev->broadcast, real_dev->addr_len);
 	memcpy(dev->dev_addr, real_dev->dev_addr, real_dev->addr_len);
-	dev->addr_len = real_dev->addr_len;
 
 	if (real_dev->features & NETIF_F_HW_VLAN_TX) {
 		dev->hard_header     = real_dev->hard_header;
@@ -364,6 +360,7 @@ static int vlan_dev_init(struct net_device *dev)
 		dev->rebuild_header  = vlan_dev_rebuild_header;
 	}
 	dev->hard_header_parse = real_dev->hard_header_parse;
+	dev->hard_header_cache = NULL;
 
 	lockdep_set_class(&dev->_xmit_lock, &vlan_netdev_xmit_lock_key);
 	return 0;
@@ -373,6 +370,8 @@ void vlan_setup(struct net_device *new_dev)
 {
 	SET_MODULE_OWNER(new_dev);
 
+	ether_setup(new_dev);
+
 	/* new_dev->ifindex = 0;  it will be set when added to
 	 * the global list.
 	 * iflink is set as well.
@@ -392,7 +391,6 @@ void vlan_setup(struct net_device *new_dev)
 	new_dev->init = vlan_dev_init;
 	new_dev->open = vlan_dev_open;
 	new_dev->stop = vlan_dev_stop;
-	new_dev->set_mac_address = vlan_dev_set_mac_address;
 	new_dev->set_multicast_list = vlan_dev_set_multicast_list;
 	new_dev->destructor = free_netdev;
 	new_dev->do_ioctl = vlan_dev_ioctl;
@@ -592,6 +590,30 @@ out_free_newdev:
 	return err;
 }
 
+static void vlan_sync_address(struct net_device *dev,
+			      struct net_device *vlandev)
+{
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(vlandev);
+
+	/* May be called without an actual change */
+	if (!compare_ether_addr(vlan->real_dev_addr, dev->dev_addr))
+		return;
+
+	/* vlan address was different from the old address and is equal to
+	 * the new address */
+	if (compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) &&
+	    !compare_ether_addr(vlandev->dev_addr, dev->dev_addr))
+		dev_unicast_delete(dev, vlandev->dev_addr, ETH_ALEN);
+
+	/* vlan address was equal to the old address and is different from
+	 * the new address */
+	if (!compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) &&
+	    compare_ether_addr(vlandev->dev_addr, dev->dev_addr))
+		dev_unicast_add(dev, vlandev->dev_addr, ETH_ALEN);
+
+	memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN);
+}
+
 static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
@@ -618,6 +640,17 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		}
 		break;
 
+	case NETDEV_CHANGEADDR:
+		/* Adjust unicast filters on underlying device */
+		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			vlan_sync_address(dev, vlandev);
+		}
+		break;
+
 	case NETDEV_DOWN:
 		/* Put all VLANs for this dev in the down state too.  */
 		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index fe6bb0f7d275..62ce1c519aab 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -58,7 +58,6 @@ int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int vlan_dev_change_mtu(struct net_device *dev, int new_mtu);
-int vlan_dev_set_mac_address(struct net_device *dev, void* addr);
 int vlan_dev_open(struct net_device* dev);
 int vlan_dev_stop(struct net_device* dev);
 int vlan_dev_ioctl(struct net_device* dev, struct ifreq *ifr, int cmd);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 95afe387b952..d4a62d1b52b4 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -612,44 +612,6 @@ void vlan_dev_get_vid(const struct net_device *dev, unsigned short *result)
 	*result = VLAN_DEV_INFO(dev)->vlan_id;
 }
 
-int vlan_dev_set_mac_address(struct net_device *dev, void *addr_struct_p)
-{
-	struct sockaddr *addr = (struct sockaddr *)(addr_struct_p);
-	int i;
-
-	if (netif_running(dev))
-		return -EBUSY;
-
-	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
-
-	printk("%s: Setting MAC address to ", dev->name);
-	for (i = 0; i < 6; i++)
-		printk(" %2.2x", dev->dev_addr[i]);
-	printk(".\n");
-
-	if (memcmp(VLAN_DEV_INFO(dev)->real_dev->dev_addr,
-		   dev->dev_addr,
-		   dev->addr_len) != 0) {
-		if (!(VLAN_DEV_INFO(dev)->real_dev->flags & IFF_PROMISC)) {
-			int flgs = VLAN_DEV_INFO(dev)->real_dev->flags;
-
-			/* Increment our in-use promiscuity counter */
-			dev_set_promiscuity(VLAN_DEV_INFO(dev)->real_dev, 1);
-
-			/* Make PROMISC visible to the user. */
-			flgs |= IFF_PROMISC;
-			printk("VLAN (%s):  Setting underlying device (%s) to promiscious mode.\n",
-			       dev->name, VLAN_DEV_INFO(dev)->real_dev->name);
-			dev_change_flags(VLAN_DEV_INFO(dev)->real_dev, flgs);
-		}
-	} else {
-		printk("VLAN (%s):  Underlying device (%s) has same MAC, not checking promiscious mode.\n",
-		       dev->name, VLAN_DEV_INFO(dev)->real_dev->name);
-	}
-
-	return 0;
-}
-
 static inline int vlan_dmi_equals(struct dev_mc_list *dmi1,
 				  struct dev_mc_list *dmi2)
 {
@@ -736,15 +698,32 @@ static void vlan_flush_mc_list(struct net_device *dev)
 
 int vlan_dev_open(struct net_device *dev)
 {
-	if (!(VLAN_DEV_INFO(dev)->real_dev->flags & IFF_UP))
+	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
+	struct net_device *real_dev = vlan->real_dev;
+	int err;
+
+	if (!(real_dev->flags & IFF_UP))
 		return -ENETDOWN;
 
+	if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) {
+		err = dev_unicast_add(real_dev, dev->dev_addr, ETH_ALEN);
+		if (err < 0)
+			return err;
+	}
+	memcpy(vlan->real_dev_addr, real_dev->dev_addr, ETH_ALEN);
+
 	return 0;
 }
 
 int vlan_dev_stop(struct net_device *dev)
 {
+	struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev;
+
 	vlan_flush_mc_list(dev);
+
+	if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr))
+		dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len);
+
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From db3d99c090e0cdb34b1274767e062bfddbb384bc Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 11 Jul 2007 19:46:26 -0700
Subject: [NET_SCHED]: ematch: module autoloading

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_cls.h | 17 +++++++----------
 include/net/pkt_cls.h   |  2 ++
 net/sched/em_cmp.c      |  1 +
 net/sched/em_meta.c     |  2 ++
 net/sched/em_nbyte.c    |  2 ++
 net/sched/em_text.c     |  2 ++
 net/sched/em_u32.c      |  2 ++
 net/sched/ematch.c      | 13 +++++++++++++
 8 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index c3f01b3085a4..30b8571e6b34 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -403,16 +403,13 @@ enum
  *   1..32767		Reserved for ematches inside kernel tree
  *   32768..65535	Free to use, not reliable
  */
-enum
-{
-	TCF_EM_CONTAINER,
-	TCF_EM_CMP,
-	TCF_EM_NBYTE,
-	TCF_EM_U32,
-	TCF_EM_META,
-	TCF_EM_TEXT,
-	__TCF_EM_MAX
-};
+#define	TCF_EM_CONTAINER	0
+#define	TCF_EM_CMP		1
+#define	TCF_EM_NBYTE		2
+#define	TCF_EM_U32		3
+#define	TCF_EM_META		4
+#define	TCF_EM_TEXT		5
+#define	TCF_EM_MAX		5
 
 enum
 {
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 4129df708079..6c29920cbe29 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -306,6 +306,8 @@ static inline int tcf_em_tree_match(struct sk_buff *skb,
 		return 1;
 }
 
+#define MODULE_ALIAS_TCF_EMATCH(kind)	MODULE_ALIAS("ematch-kind-" __stringify(kind))
+
 #else /* CONFIG_NET_EMATCH */
 
 struct tcf_ematch_tree
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index 8d6dacd81900..cc49c932641d 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -98,3 +98,4 @@ MODULE_LICENSE("GPL");
 module_init(init_em_cmp);
 module_exit(exit_em_cmp);
 
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 60acf8cdb27b..650f09c8bd6a 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -848,3 +848,5 @@ MODULE_LICENSE("GPL");
 
 module_init(init_em_meta);
 module_exit(exit_em_meta);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index b4b36efce292..370a1b2ea317 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -76,3 +76,5 @@ MODULE_LICENSE("GPL");
 
 module_init(init_em_nbyte);
 module_exit(exit_em_nbyte);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE);
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index e8f46169449d..d5cd86efb7d0 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -150,3 +150,5 @@ MODULE_LICENSE("GPL");
 
 module_init(init_em_text);
 module_exit(exit_em_text);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT);
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 0a2a7fe08de3..112796e4a7c4 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -60,3 +60,5 @@ MODULE_LICENSE("GPL");
 
 module_init(init_em_u32);
 module_exit(exit_em_u32);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32);
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 24837391640d..f3a104e323bd 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -222,6 +222,19 @@ static int tcf_em_validate(struct tcf_proto *tp,
 
 		if (em->ops == NULL) {
 			err = -ENOENT;
+#ifdef CONFIG_KMOD
+			__rtnl_unlock();
+			request_module("ematch-kind-%u", em_hdr->kind);
+			rtnl_lock();
+			em->ops = tcf_em_lookup(em_hdr->kind);
+			if (em->ops) {
+				/* We dropped the RTNL mutex in order to
+				 * perform the module load. Tell the caller
+				 * to replay the request. */
+				module_put(em->ops->owner);
+				err = -EAGAIN;
+			}
+#endif
 			goto errout;
 		}
 
-- 
cgit v1.3-8-gc7d7


From ed0321895182ffb6ecf210e066d87911b270d587 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 28 Jun 2007 15:55:21 -0400
Subject: security: Protection for exploiting null dereference using mmap

Add a new security check on mmap operations to see if the user is attempting
to mmap to low area of the address space.  The amount of space protected is
indicated by the new proc tunable /proc/sys/vm/mmap_min_addr and defaults to
0, preserving existing behavior.

This patch uses a new SELinux security class "memprotect."  Policy already
contains a number of allow rules like a_t self:process * (unconfined_t being
one of them) which mean that putting this check in the process class (its
best current fit) would make it useless as all user processes, which we also
want to protect against, would be allowed. By taking the memprotect name of
the new class it will also make it possible for us to move some of the other
memory protect permissions out of 'process' and into the new class next time
we bump the policy version number (which I also think is a good future idea)

Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 Documentation/sysctl/vm.txt                  | 15 +++++++++++++++
 include/linux/security.h                     | 17 ++++++++++++-----
 kernel/sysctl.c                              | 10 ++++++++++
 mm/mmap.c                                    |  4 ++--
 mm/mremap.c                                  | 13 +++++++++++--
 mm/nommu.c                                   |  2 +-
 security/dummy.c                             |  6 +++++-
 security/security.c                          |  2 ++
 security/selinux/hooks.c                     | 12 ++++++++----
 security/selinux/include/av_perm_to_string.h |  1 +
 security/selinux/include/av_permissions.h    |  1 +
 security/selinux/include/class_to_string.h   |  1 +
 security/selinux/include/flask.h             |  1 +
 13 files changed, 70 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 1d192565e182..8cfca173d4bc 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/vm:
 - min_unmapped_ratio
 - min_slab_ratio
 - panic_on_oom
+- mmap_min_address
 
 ==============================================================
 
@@ -216,3 +217,17 @@ above-mentioned.
 The default value is 0.
 1 and 2 are for failover of clustering. Please select either
 according to your policy of failover.
+
+==============================================================
+
+mmap_min_addr
+
+This file indicates the amount of address space  which a user process will
+be restricted from mmaping.  Since kernel null dereference bugs could
+accidentally operate based on the information in the first couple of pages
+of memory userspace processes should not be allowed to write to them.  By
+default this value is set to 0 and no protections will be enforced by the
+security module.  Setting this value to something like 64k will allow the
+vast majority of applications to work correctly and provide defense in depth
+against future potential kernel bugs.
+
diff --git a/include/linux/security.h b/include/linux/security.h
index 9eb9e0fe0331..c11dc8aa0351 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -71,6 +71,7 @@ struct xfrm_user_sec_ctx;
 extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
 extern int cap_netlink_recv(struct sk_buff *skb, int cap);
 
+extern unsigned long mmap_min_addr;
 /*
  * Values used in the task_security_ops calls
  */
@@ -1241,8 +1242,9 @@ struct security_operations {
 	int (*file_ioctl) (struct file * file, unsigned int cmd,
 			   unsigned long arg);
 	int (*file_mmap) (struct file * file,
-			  unsigned long reqprot,
-			  unsigned long prot, unsigned long flags);
+			  unsigned long reqprot, unsigned long prot,
+			  unsigned long flags, unsigned long addr,
+			  unsigned long addr_only);
 	int (*file_mprotect) (struct vm_area_struct * vma,
 			      unsigned long reqprot,
 			      unsigned long prot);
@@ -1814,9 +1816,12 @@ static inline int security_file_ioctl (struct file *file, unsigned int cmd,
 
 static inline int security_file_mmap (struct file *file, unsigned long reqprot,
 				      unsigned long prot,
-				      unsigned long flags)
+				      unsigned long flags,
+				      unsigned long addr,
+				      unsigned long addr_only)
 {
-	return security_ops->file_mmap (file, reqprot, prot, flags);
+	return security_ops->file_mmap (file, reqprot, prot, flags, addr,
+					addr_only);
 }
 
 static inline int security_file_mprotect (struct vm_area_struct *vma,
@@ -2489,7 +2494,9 @@ static inline int security_file_ioctl (struct file *file, unsigned int cmd,
 
 static inline int security_file_mmap (struct file *file, unsigned long reqprot,
 				      unsigned long prot,
-				      unsigned long flags)
+				      unsigned long flags,
+				      unsigned long addr,
+				      unsigned long addr_only)
 {
 	return 0;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 51f5dac42a00..d93e13d93f24 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -949,6 +949,16 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
+#ifdef CONFIG_SECURITY
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "mmap_min_addr",
+		.data		= &mmap_min_addr,
+		.maxlen         = sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+#endif
 #if defined(CONFIG_X86_32) || \
    (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
 	{
diff --git a/mm/mmap.c b/mm/mmap.c
index 906ed402f7ca..9f70c8e8c871 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1023,10 +1023,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		}
 	}
 
-	error = security_file_mmap(file, reqprot, prot, flags);
+	error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (error)
 		return error;
-		
+
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
diff --git a/mm/mremap.c b/mm/mremap.c
index 5d4bd4f95b8e..bc7c52efc71b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;
 
+		ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+		if (ret)
+			goto out;
+
 		ret = do_munmap(mm, new_addr, new_len);
 		if (ret)
 			goto out;
@@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr,
 
 			new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
 						vma->vm_pgoff, map_flags);
-			ret = new_addr;
-			if (new_addr & ~PAGE_MASK)
+			if (new_addr & ~PAGE_MASK) {
+				ret = new_addr;
+				goto out;
+			}
+
+			ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+			if (ret)
 				goto out;
 		}
 		ret = move_vma(vma, addr, old_len, new_len, new_addr);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b16b00a5b11..989e2e9af5c3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -639,7 +639,7 @@ static int validate_mmap_request(struct file *file,
 	}
 
 	/* allow the security API to have its say */
-	ret = security_file_mmap(file, reqprot, prot, flags);
+	ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (ret < 0)
 		return ret;
 
diff --git a/security/dummy.c b/security/dummy.c
index 8ffd76405b5b..d6a112ce2975 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -420,8 +420,12 @@ static int dummy_file_ioctl (struct file *file, unsigned int command,
 
 static int dummy_file_mmap (struct file *file, unsigned long reqprot,
 			    unsigned long prot,
-			    unsigned long flags)
+			    unsigned long flags,
+			    unsigned long addr,
+			    unsigned long addr_only)
 {
+	if (addr < mmap_min_addr)
+		return -EACCES;
 	return 0;
 }
 
diff --git a/security/security.c b/security/security.c
index fc8601b2b7ac..024484fc59b0 100644
--- a/security/security.c
+++ b/security/security.c
@@ -24,6 +24,7 @@ extern struct security_operations dummy_security_ops;
 extern void security_fixup_ops(struct security_operations *ops);
 
 struct security_operations *security_ops;	/* Initialized to NULL */
+unsigned long mmap_min_addr;		/* 0 means no protection */
 
 static inline int verify(struct security_operations *ops)
 {
@@ -176,4 +177,5 @@ EXPORT_SYMBOL_GPL(register_security);
 EXPORT_SYMBOL_GPL(unregister_security);
 EXPORT_SYMBOL_GPL(mod_reg_security);
 EXPORT_SYMBOL_GPL(mod_unreg_security);
+EXPORT_SYMBOL_GPL(mmap_min_addr);
 EXPORT_SYMBOL(security_ops);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b29059ecc045..78c3f98fcdcf 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2569,12 +2569,16 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared
 }
 
 static int selinux_file_mmap(struct file *file, unsigned long reqprot,
-			     unsigned long prot, unsigned long flags)
+			     unsigned long prot, unsigned long flags,
+			     unsigned long addr, unsigned long addr_only)
 {
-	int rc;
+	int rc = 0;
+	u32 sid = ((struct task_security_struct*)(current->security))->sid;
 
-	rc = secondary_ops->file_mmap(file, reqprot, prot, flags);
-	if (rc)
+	if (addr < mmap_min_addr)
+		rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
+				  MEMPROTECT__MMAP_ZERO, NULL);
+	if (rc || addr_only)
 		return rc;
 
 	if (selinux_checkreqprot)
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index b83e74012a97..049bf69429b6 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -158,3 +158,4 @@
    S_(SECCLASS_KEY, KEY__CREATE, "create")
    S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind")
    S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect")
+   S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index 5fee1735bffe..eda89a2ec635 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -823,3 +823,4 @@
 #define DCCP_SOCKET__NAME_BIND                    0x00200000UL
 #define DCCP_SOCKET__NODE_BIND                    0x00400000UL
 #define DCCP_SOCKET__NAME_CONNECT                 0x00800000UL
+#define MEMPROTECT__MMAP_ZERO                     0x00000001UL
diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h
index 378799068441..e77de0e62ea0 100644
--- a/security/selinux/include/class_to_string.h
+++ b/security/selinux/include/class_to_string.h
@@ -63,3 +63,4 @@
     S_("key")
     S_(NULL)
     S_("dccp_socket")
+    S_("memprotect")
diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h
index 35f309f47873..a9c2b20f14b5 100644
--- a/security/selinux/include/flask.h
+++ b/security/selinux/include/flask.h
@@ -49,6 +49,7 @@
 #define SECCLASS_PACKET                                  57
 #define SECCLASS_KEY                                     58
 #define SECCLASS_DCCP_SOCKET                             60
+#define SECCLASS_MEMPROTECT                              61
 
 /*
  * Security identifier indices for initial entities
-- 
cgit v1.3-8-gc7d7


From d64f73be1b59b9556de0a8fbd4f1a003c6a45a5c Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Thu, 12 Jul 2007 14:12:28 +0200
Subject: i2c: Add kernel documentation

Generate I2C kerneldoc; fix various glitches and add "context" sections to
that documentation.  Most I2C and SMBus functions still have no kerneldoc.

Let me suggest providing kerneldoc for all the i2c_smbus_*() functions as
a small and mostly self-contained project for anyone so inclined.  :)

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/DocBook/kernel-api.tmpl | 55 +++++++++++++++++++++++++++++++++++
 drivers/i2c/i2c-core.c                | 13 +++++++++
 include/linux/i2c.h                   | 11 +++++--
 3 files changed, 76 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 8c5698a8c2e1..46bcff2849bd 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -643,6 +643,60 @@ X!Idrivers/video/console/fonts.c
 !Edrivers/spi/spi.c
   </chapter>
 
+  <chapter id="i2c">
+     <title>I<superscript>2</superscript>C and SMBus Subsystem</title>
+
+     <para>
+	I<superscript>2</superscript>C (or without fancy typography, "I2C")
+	is an acronym for the "Inter-IC" bus, a simple bus protocol which is
+	widely used where low data rate communications suffice.
+	Since it's also a licensed trademark, some vendors use another
+	name (such as "Two-Wire Interface", TWI) for the same bus.
+	I2C only needs two signals (SCL for clock, SDA for data), conserving
+	board real estate and minimizing signal quality issues.
+	Most I2C devices use seven bit addresses, and bus speeds of up
+	to 400 kHz; there's a high speed extension (3.4 MHz) that's not yet
+	found wide use.
+	I2C is a multi-master bus; open drain signaling is used to
+	arbitrate between masters, as well as to handshake and to
+	synchronize clocks from slower clients.
+     </para>
+
+     <para>
+	The Linux I2C programming interfaces support only the master
+	side of bus interactions, not the slave side.
+	The programming interface is structured around two kinds of driver,
+	and two kinds of device.
+	An I2C "Adapter Driver" abstracts the controller hardware; it binds
+	to a physical device (perhaps a PCI device or platform_device) and
+	exposes a <structname>struct i2c_adapter</structname> representing
+	each I2C bus segment it manages.
+	On each I2C bus segment will be I2C devices represented by a
+	<structname>struct i2c_client</structname>.  Those devices will
+	be bound to a <structname>struct i2c_driver</structname>,
+	which should follow the standard Linux driver model.
+	(At this writing, a legacy model is more widely used.)
+	There are functions to perform various I2C protocol operations; at
+	this writing all such functions are usable only from task context.
+     </para>
+
+     <para>
+	The System Management Bus (SMBus) is a sibling protocol.  Most SMBus
+	systems are also I2C conformant.  The electrical constraints are
+	tighter for SMBus, and it standardizes particular protocol messages
+	and idioms.  Controllers that support I2C can also support most
+	SMBus operations, but SMBus controllers don't support all the protocol
+	options that an I2C controller will.
+	There are functions to perform various SMBus protocol operations,
+	either using I2C primitives or by issuing SMBus commands to
+	i2c_adapter devices which don't support those I2C operations.
+     </para>
+
+!Iinclude/linux/i2c.h
+!Fdrivers/i2c/i2c-boardinfo.c i2c_register_board_info
+!Edrivers/i2c/i2c-core.c
+  </chapter>
+
   <chapter id="splice">
       <title>splice API</title>
   <para>)
@@ -654,4 +708,5 @@ X!Idrivers/video/console/fonts.c
 !Ffs/splice.c
   </chapter>
 
+
 </book>
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 435925eba437..cccfa8678245 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -207,6 +207,7 @@ EXPORT_SYMBOL_GPL(i2c_bus_type);
  * i2c_new_device - instantiate an i2c device for use with a new style driver
  * @adap: the adapter managing the device
  * @info: describes one I2C device; bus_num is ignored
+ * Context: can sleep
  *
  * Create a device to work with a new style i2c driver, where binding is
  * handled through driver model probe()/remove() methods.  This call is not
@@ -255,6 +256,7 @@ EXPORT_SYMBOL_GPL(i2c_new_device);
 /**
  * i2c_unregister_device - reverse effect of i2c_new_device()
  * @client: value returned from i2c_new_device()
+ * Context: can sleep
  */
 void i2c_unregister_device(struct i2c_client *client)
 {
@@ -379,6 +381,7 @@ out_list:
 /**
  * i2c_add_adapter - declare i2c adapter, use dynamic bus number
  * @adapter: the adapter to add
+ * Context: can sleep
  *
  * This routine is used to declare an I2C adapter when its bus number
  * doesn't matter.  Examples: for I2C adapters dynamically added by
@@ -416,6 +419,7 @@ EXPORT_SYMBOL(i2c_add_adapter);
 /**
  * i2c_add_numbered_adapter - declare i2c adapter, use static bus number
  * @adap: the adapter to register (with adap->nr initialized)
+ * Context: can sleep
  *
  * This routine is used to declare an I2C adapter when its bus number
  * matters.  Example: for I2C adapters from system-on-chip CPUs, or
@@ -463,6 +467,14 @@ retry:
 }
 EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter);
 
+/**
+ * i2c_del_adapter - unregister I2C adapter
+ * @adap: the adapter being unregistered
+ * Context: can sleep
+ *
+ * This unregisters an I2C adapter which was previously registered
+ * by @i2c_add_adapter or @i2c_add_numbered_adapter.
+ */
 int i2c_del_adapter(struct i2c_adapter *adap)
 {
 	struct list_head  *item, *_n;
@@ -598,6 +610,7 @@ EXPORT_SYMBOL(i2c_register_driver);
 /**
  * i2c_del_driver - unregister I2C driver
  * @driver: the driver being unregistered
+ * Context: can sleep
  */
 void i2c_del_driver(struct i2c_driver *driver)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index cae7d618030c..a24e267fd189 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -150,15 +150,20 @@ struct i2c_driver {
 
 /**
  * struct i2c_client - represent an I2C slave device
+ * @flags: I2C_CLIENT_TEN indicates the device uses a ten bit chip address;
+ *	I2C_CLIENT_PEC indicates it uses SMBus Packet Error Checking
  * @addr: Address used on the I2C bus connected to the parent adapter.
  * @name: Indicates the type of the device, usually a chip name that's
  *	generic enough to hide second-sourcing and compatible revisions.
+ * @adapter: manages the bus segment hosting this I2C device
  * @dev: Driver model device node for the slave.
+ * @irq: indicates the IRQ generated by this device (if any)
  * @driver_name: Identifies new-style driver used with this device; also
  *	used as the module name for hotplug/coldplug modprobe support.
  *
  * An i2c_client identifies a single device (i.e. chip) connected to an
- * i2c bus. The behaviour is defined by the routines of the driver.
+ * i2c bus. The behaviour exposed to Linux is defined by the driver
+ * managing the device.
  */
 struct i2c_client {
 	unsigned short flags;		/* div., see below		*/
@@ -201,7 +206,7 @@ static inline void i2c_set_clientdata (struct i2c_client *dev, void *data)
  * @addr: stored in i2c_client.addr
  * @platform_data: stored in i2c_client.dev.platform_data
  * @irq: stored in i2c_client.irq
-
+ *
  * I2C doesn't actually support hardware probing, although controllers and
  * devices may be able to use I2C_SMBUS_QUICK to tell whether or not there's
  * a device at a given address.  Drivers commonly need more information than
@@ -210,7 +215,7 @@ static inline void i2c_set_clientdata (struct i2c_client *dev, void *data)
  * i2c_board_info is used to build tables of information listing I2C devices
  * that are present.  This information is used to grow the driver model tree
  * for "new style" I2C drivers.  For mainboards this is done statically using
- * i2c_register_board_info(), where @bus_num represents an adapter that isn't
+ * i2c_register_board_info(); bus numbers identify adapters that aren't
  * yet available.  For add-on boards, i2c_new_device() does this dynamically
  * with the adapter already known.
  */
-- 
cgit v1.3-8-gc7d7


From d75d53cd571c02990d56e72f615ab11e943772f9 Mon Sep 17 00:00:00 2001
From: "Mark M. Hoffman" <mhoffman@lightlink.com>
Date: Thu, 12 Jul 2007 14:12:28 +0200
Subject: i2c: Fix sparse warning in i2c.h

Kill a sparse warning by un-nesting two container_of() calls.

Signed-off-by: Mark M. Hoffman <mhoffman@lightlink.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 include/linux/i2c.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index a24e267fd189..44f2ecf47d9f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -185,7 +185,8 @@ struct i2c_client {
 
 static inline struct i2c_client *kobj_to_i2c_client(struct kobject *kobj)
 {
-	return to_i2c_client(container_of(kobj, struct device, kobj));
+	struct device * const dev = container_of(kobj, struct device, kobj);
+	return to_i2c_client(dev);
 }
 
 static inline void *i2c_get_clientdata (struct i2c_client *dev)
-- 
cgit v1.3-8-gc7d7


From 4b2643d7d9bdcd776749e17f73c168ddf02e93cb Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Thu, 12 Jul 2007 14:12:29 +0200
Subject: i2c: Fix the i2c_smbus_read_i2c_block_data() prototype

Let the drivers specify how many bytes they want to read with
i2c_smbus_read_i2c_block_data(). So far, the block count was
hard-coded to I2C_SMBUS_BLOCK_MAX (32), which did not make much sense.
Many driver authors complained about this before, and I believe it's
about time to fix it. Right now, authors have to do technically stupid
things, such as individual byte reads or full-fledged I2C messaging,
to work around the problem. We do not want to encourage that.

I even found that some bus drivers (e.g. i2c-amd8111) already
implemented I2C block read the "right" way, that is, they didn't
follow the old, broken standard. The fact that it was never noticed
before just shows how little i2c_smbus_read_i2c_block_data() was used,
which isn't that surprising given how broken its prototype was so far.

There are some obvious compatiblity considerations:
* This changes the i2c_smbus_read_i2c_block_data() prototype. Users
  outside the kernel tree will notice at compilation time, and will
  have to update their code.
* User-space has access to i2c_smbus_xfer() directly using i2c-dev, so
  the changed expectations would affect tools such as i2cdump. In order
  to preserve binary compatibility, we give I2C_SMBUS_I2C_BLOCK_DATA
  a new numeric value, and define I2C_SMBUS_I2C_BLOCK_BROKEN with the
  old numeric value. When i2c-dev receives a transaction with the
  old value, it can convert it to the new format on the fly.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/i2c/chips/max6875      |  2 +-
 Documentation/i2c/writing-clients    |  2 +-
 drivers/i2c/busses/i2c-powermac.c    |  3 +--
 drivers/i2c/busses/i2c-viapro.c      |  2 +-
 drivers/i2c/busses/scx200_acb.c      |  2 --
 drivers/i2c/chips/eeprom.c           |  6 ++++--
 drivers/i2c/chips/max6875.c          |  1 +
 drivers/i2c/i2c-core.c               | 12 +++++++-----
 drivers/i2c/i2c-dev.c                |  9 +++++++++
 drivers/macintosh/windfarm_smu_sat.c | 28 ++++------------------------
 include/linux/i2c.h                  |  5 +++--
 11 files changed, 32 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/chips/max6875 b/Documentation/i2c/chips/max6875
index 96fec562a8e9..a0cd8af2f408 100644
--- a/Documentation/i2c/chips/max6875
+++ b/Documentation/i2c/chips/max6875
@@ -99,7 +99,7 @@ And then read the data
 
   or
 
-  count = i2c_smbus_read_i2c_block_data(fd, 0x84, buffer);
+  count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer);
 
 The block read should read 16 bytes.
 0x84 is the block read command.
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index 3d8d36b0ad12..2c170032bf37 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -571,7 +571,7 @@ SMBus communication
                                         u8 command, u8 length,
                                         u8 *values);
   extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client,
-                                           u8 command, u8 *values);
+                                           u8 command, u8 length, u8 *values);
 
 These ones were removed in Linux 2.6.10 because they had no users, but could
 be added back later if needed:
diff --git a/drivers/i2c/busses/i2c-powermac.c b/drivers/i2c/busses/i2c-powermac.c
index 1425d2245c82..0ab4f2627c26 100644
--- a/drivers/i2c/busses/i2c-powermac.c
+++ b/drivers/i2c/busses/i2c-powermac.c
@@ -121,8 +121,7 @@ static s32 i2c_powermac_smbus_xfer(	struct i2c_adapter*	adap,
 		if (rc)
 			goto bail;
 		rc = pmac_i2c_xfer(bus, addrdir, 1, command,
-				   read ? data->block : &data->block[1],
-				   data->block[0]);
+				   &data->block[1], data->block[0]);
 		break;
 
         default:
diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c
index 7a2bc06304fc..a0f7e4a303b5 100644
--- a/drivers/i2c/busses/i2c-viapro.c
+++ b/drivers/i2c/busses/i2c-viapro.c
@@ -235,7 +235,7 @@ static s32 vt596_access(struct i2c_adapter *adap, u16 addr,
 		if (!(vt596_features & FEATURE_I2CBLOCK))
 			goto exit_unsupported;
 		if (read_write == I2C_SMBUS_READ)
-			outb_p(I2C_SMBUS_BLOCK_MAX, SMBHSTDAT0);
+			outb_p(data->block[0], SMBHSTDAT0);
 		/* Fall through */
 	case I2C_SMBUS_BLOCK_DATA:
 		outb_p(command, SMBHSTCMD);
diff --git a/drivers/i2c/busses/scx200_acb.c b/drivers/i2c/busses/scx200_acb.c
index dc64f8b6b24a..e6c4a2b762ec 100644
--- a/drivers/i2c/busses/scx200_acb.c
+++ b/drivers/i2c/busses/scx200_acb.c
@@ -310,8 +310,6 @@ static s32 scx200_acb_smbus_xfer(struct i2c_adapter *adapter,
 		break;
 
 	case I2C_SMBUS_I2C_BLOCK_DATA:
-		if (rw == I2C_SMBUS_READ)
-			data->block[0] = I2C_SMBUS_BLOCK_MAX; /* For now */
 		len = data->block[0];
 		if (len == 0 || len > I2C_SMBUS_BLOCK_MAX)
 			return -EINVAL;
diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c
index bfce13c8f1ff..48f857ae8748 100644
--- a/drivers/i2c/chips/eeprom.c
+++ b/drivers/i2c/chips/eeprom.c
@@ -88,8 +88,10 @@ static void eeprom_update_client(struct i2c_client *client, u8 slice)
 		dev_dbg(&client->dev, "Starting eeprom update, slice %u\n", slice);
 
 		if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_I2C_BLOCK)) {
-			for (i = slice << 5; i < (slice + 1) << 5; i += I2C_SMBUS_BLOCK_MAX)
-				if (i2c_smbus_read_i2c_block_data(client, i, data->data + i) != I2C_SMBUS_BLOCK_MAX)
+			for (i = slice << 5; i < (slice + 1) << 5; i += 32)
+				if (i2c_smbus_read_i2c_block_data(client, i,
+							32, data->data + i)
+							!= 32)
 					goto exit;
 		} else {
 			if (i2c_smbus_write_byte(client, slice << 5)) {
diff --git a/drivers/i2c/chips/max6875.c b/drivers/i2c/chips/max6875.c
index 76645c142977..e9e9e5171b53 100644
--- a/drivers/i2c/chips/max6875.c
+++ b/drivers/i2c/chips/max6875.c
@@ -106,6 +106,7 @@ static void max6875_update_slice(struct i2c_client *client, int slice)
 					    I2C_FUNC_SMBUS_READ_I2C_BLOCK)) {
 			if (i2c_smbus_read_i2c_block_data(client,
 							  MAX6875_CMD_BLK_READ,
+							  SLICE_SIZE,
 							  buf) != SLICE_SIZE) {
 				goto exit_up;
 			}
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index cccfa8678245..6971a62397db 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -1344,10 +1344,14 @@ s32 i2c_smbus_write_block_data(struct i2c_client *client, u8 command,
 EXPORT_SYMBOL(i2c_smbus_write_block_data);
 
 /* Returns the number of read bytes */
-s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, u8 command, u8 *values)
+s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, u8 command,
+				  u8 length, u8 *values)
 {
 	union i2c_smbus_data data;
 
+	if (length > I2C_SMBUS_BLOCK_MAX)
+		length = I2C_SMBUS_BLOCK_MAX;
+	data.block[0] = length;
 	if (i2c_smbus_xfer(client->adapter,client->addr,client->flags,
 	                      I2C_SMBUS_READ,command,
 	                      I2C_SMBUS_I2C_BLOCK_DATA,&data))
@@ -1468,7 +1472,7 @@ static s32 i2c_smbus_xfer_emulated(struct i2c_adapter * adapter, u16 addr,
 		break;
 	case I2C_SMBUS_I2C_BLOCK_DATA:
 		if (read_write == I2C_SMBUS_READ) {
-			msg[1].len = I2C_SMBUS_BLOCK_MAX;
+			msg[1].len = data->block[0];
 		} else {
 			msg[0].len = data->block[0] + 1;
 			if (msg[0].len > I2C_SMBUS_BLOCK_MAX + 1) {
@@ -1524,9 +1528,7 @@ static s32 i2c_smbus_xfer_emulated(struct i2c_adapter * adapter, u16 addr,
 				data->word = msgbuf1[0] | (msgbuf1[1] << 8);
 				break;
 			case I2C_SMBUS_I2C_BLOCK_DATA:
-				/* fixed at 32 for now */
-				data->block[0] = I2C_SMBUS_BLOCK_MAX;
-				for (i = 0; i < I2C_SMBUS_BLOCK_MAX; i++)
+				for (i = 0; i < data->block[0]; i++)
 					data->block[i+1] = msgbuf1[i];
 				break;
 			case I2C_SMBUS_BLOCK_DATA:
diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c
index e7a709710592..64eee9551b22 100644
--- a/drivers/i2c/i2c-dev.c
+++ b/drivers/i2c/i2c-dev.c
@@ -283,6 +283,7 @@ static int i2cdev_ioctl(struct inode *inode, struct file *file,
 		    (data_arg.size != I2C_SMBUS_WORD_DATA) &&
 		    (data_arg.size != I2C_SMBUS_PROC_CALL) &&
 		    (data_arg.size != I2C_SMBUS_BLOCK_DATA) &&
+		    (data_arg.size != I2C_SMBUS_I2C_BLOCK_BROKEN) &&
 		    (data_arg.size != I2C_SMBUS_I2C_BLOCK_DATA) &&
 		    (data_arg.size != I2C_SMBUS_BLOCK_PROC_CALL)) {
 			dev_dbg(&client->adapter->dev,
@@ -329,10 +330,18 @@ static int i2cdev_ioctl(struct inode *inode, struct file *file,
 
 		if ((data_arg.size == I2C_SMBUS_PROC_CALL) ||
 		    (data_arg.size == I2C_SMBUS_BLOCK_PROC_CALL) ||
+		    (data_arg.size == I2C_SMBUS_I2C_BLOCK_DATA) ||
 		    (data_arg.read_write == I2C_SMBUS_WRITE)) {
 			if (copy_from_user(&temp, data_arg.data, datasize))
 				return -EFAULT;
 		}
+		if (data_arg.size == I2C_SMBUS_I2C_BLOCK_BROKEN) {
+			/* Convert old I2C block commands to the new
+			   convention. This preserves binary compatibility. */
+			data_arg.size = I2C_SMBUS_I2C_BLOCK_DATA;
+			if (data_arg.read_write == I2C_SMBUS_READ)
+				temp.block[0] = I2C_SMBUS_BLOCK_MAX;
+		}
 		res = i2c_smbus_xfer(client->adapter,client->addr,client->flags,
 		      data_arg.read_write,
 		      data_arg.command,data_arg.size,&temp);
diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c
index 1043b39aa123..351982bcec1b 100644
--- a/drivers/macintosh/windfarm_smu_sat.c
+++ b/drivers/macintosh/windfarm_smu_sat.c
@@ -67,26 +67,6 @@ static struct i2c_driver wf_sat_driver = {
 	.detach_client	= wf_sat_detach,
 };
 
-/*
- * XXX i2c_smbus_read_i2c_block_data doesn't pass the requested
- * length down to the low-level driver, so we use this, which
- * works well enough with the SMU i2c driver code...
- */
-static int sat_read_block(struct i2c_client *client, u8 command,
-			  u8 *values, int len)
-{
-	union i2c_smbus_data data;
-	int err;
-
-	data.block[0] = len;
-	err = i2c_smbus_xfer(client->adapter, client->addr, client->flags,
-			     I2C_SMBUS_READ, command, I2C_SMBUS_I2C_BLOCK_DATA,
-			     &data);
-	if (!err)
-		memcpy(values, data.block, len);
-	return err;
-}
-
 struct smu_sdbp_header *smu_sat_get_sdb_partition(unsigned int sat_id, int id,
 						  unsigned int *size)
 {
@@ -124,8 +104,8 @@ struct smu_sdbp_header *smu_sat_get_sdb_partition(unsigned int sat_id, int id,
 		return NULL;
 
 	for (i = 0; i < len; i += 4) {
-		err = sat_read_block(&sat->i2c, 0xa, data, 4);
-		if (err) {
+		err = i2c_smbus_read_i2c_block_data(&sat->i2c, 0xa, 4, data);
+		if (err < 0) {
 			printk(KERN_ERR "smu_sat_get_sdb_part rd err %d\n",
 			       err);
 			goto fail;
@@ -157,8 +137,8 @@ static int wf_sat_read_cache(struct wf_sat *sat)
 {
 	int err;
 
-	err = sat_read_block(&sat->i2c, 0x3f, sat->cache, 16);
-	if (err)
+	err = i2c_smbus_read_i2c_block_data(&sat->i2c, 0x3f, 16, sat->cache);
+	if (err < 0)
 		return err;
 	sat->last_read = jiffies;
 #ifdef LOTSA_DEBUG
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 44f2ecf47d9f..2eaba21b9b1a 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -90,7 +90,7 @@ extern s32 i2c_smbus_write_block_data(struct i2c_client * client,
 				      const u8 *values);
 /* Returns the number of read bytes */
 extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client,
-					 u8 command, u8 *values);
+					 u8 command, u8 length, u8 *values);
 extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client,
 					  u8 command, u8 length,
 					  const u8 *values);
@@ -524,8 +524,9 @@ union i2c_smbus_data {
 #define I2C_SMBUS_WORD_DATA	    3
 #define I2C_SMBUS_PROC_CALL	    4
 #define I2C_SMBUS_BLOCK_DATA	    5
-#define I2C_SMBUS_I2C_BLOCK_DATA    6
+#define I2C_SMBUS_I2C_BLOCK_BROKEN  6
 #define I2C_SMBUS_BLOCK_PROC_CALL   7		/* SMBus 2.0 */
+#define I2C_SMBUS_I2C_BLOCK_DATA    8
 
 
 /* ----- commands for the ioctl like i2c_command call:
-- 
cgit v1.3-8-gc7d7


From c29c22218b99dad95f7cd0281415a854aeee805c Mon Sep 17 00:00:00 2001
From: Henry Su <henry.su@amd.com>
Date: Thu, 12 Jul 2007 14:12:29 +0200
Subject: i2c-piix4: Add support for the ATI SB700

Add the SMBus device ID for ATI SB700.

Signed-off-by: Henry Su <Henry.su@amd.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/i2c/busses/i2c-piix4 | 2 +-
 drivers/i2c/busses/Kconfig         | 1 +
 drivers/i2c/busses/i2c-piix4.c     | 4 +++-
 include/linux/pci_ids.h            | 1 +
 4 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4
index 7cbe43fa2701..fa0c786a8bf5 100644
--- a/Documentation/i2c/busses/i2c-piix4
+++ b/Documentation/i2c/busses/i2c-piix4
@@ -6,7 +6,7 @@ Supported adapters:
     Datasheet: Publicly available at the Intel website
   * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges
     Datasheet: Only available via NDA from ServerWorks
-  * ATI IXP200, IXP300, IXP400 and SB600 southbridges
+  * ATI IXP200, IXP300, IXP400, SB600 and SB700 southbridges
     Datasheet: Not publicly available
   * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge
     Datasheet: Publicly available at the SMSC website http://www.smsc.com
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 838dc1c19d61..165c1266fff7 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -207,6 +207,7 @@ config I2C_PIIX4
 	    ATI IXP300
 	    ATI IXP400
 	    ATI SB600
+	    ATI SB700
 	    Serverworks OSB4
 	    Serverworks CSB5
 	    Serverworks CSB6
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 5a52bf5e3fb0..debc76cd2161 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -23,7 +23,7 @@
    Supports:
 	Intel PIIX4, 440MX
 	Serverworks OSB4, CSB5, CSB6, HT-1000
-	ATI IXP200, IXP300, IXP400, SB600
+	ATI IXP200, IXP300, IXP400, SB600, SB700
 	SMSC Victory66
 
    Note: we assume there can only be one device, with one SMBus interface.
@@ -399,6 +399,8 @@ static struct pci_device_id piix4_ids[] = {
 	  .driver_data = 0 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_SMBUS),
 	  .driver_data = 0 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SMBUS),
+	  .driver_data = 0 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_OSB4),
 	  .driver_data = 0 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB5),
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 75c4d4d06892..8300001e9078 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -371,6 +371,7 @@
 #define PCI_DEVICE_ID_ATI_IXP600_SMBUS	0x4385
 #define PCI_DEVICE_ID_ATI_IXP600_IDE	0x438c
 #define PCI_DEVICE_ID_ATI_IXP700_SATA	0x4390
+#define PCI_DEVICE_ID_ATI_IXP700_SMBUS	0x4395
 #define PCI_DEVICE_ID_ATI_IXP700_IDE	0x439c
 
 #define PCI_VENDOR_ID_VLSI		0x1004
-- 
cgit v1.3-8-gc7d7


From b9cdad74883a797952de52464d118d685cafc05a Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Thu, 12 Jul 2007 14:12:31 +0200
Subject: i2c: New bus driver for the TAOS evaluation modules

This is a new I2C bus driver for the TAOS evaluation modules. Developped
and tested on the TAOS TSL2550 EVM.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/i2c/busses/i2c-taos-evm |  46 +++++
 drivers/i2c/busses/Kconfig            |  16 ++
 drivers/i2c/busses/Makefile           |   1 +
 drivers/i2c/busses/i2c-taos-evm.c     | 330 ++++++++++++++++++++++++++++++++++
 include/linux/serio.h                 |   1 +
 5 files changed, 394 insertions(+)
 create mode 100644 Documentation/i2c/busses/i2c-taos-evm
 create mode 100644 drivers/i2c/busses/i2c-taos-evm.c

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-taos-evm b/Documentation/i2c/busses/i2c-taos-evm
new file mode 100644
index 000000000000..9146e33be6dd
--- /dev/null
+++ b/Documentation/i2c/busses/i2c-taos-evm
@@ -0,0 +1,46 @@
+Kernel driver i2c-taos-evm
+
+Author: Jean Delvare <khali@linux-fr.org>
+
+This is a driver for the evaluation modules for TAOS I2C/SMBus chips.
+The modules include an SMBus master with limited capabilities, which can
+be controlled over the serial port. Virtually all evaluation modules
+are supported, but a few lines of code need to be added for each new
+module to instantiate the right I2C chip on the bus. Obviously, a driver
+for the chip in question is also needed.
+
+Currently supported devices are:
+
+* TAOS TSL2550 EVM
+
+For addtional information on TAOS products, please see
+  http://www.taosinc.com/
+
+
+Using this driver
+-----------------
+
+In order to use this driver, you'll need the serport driver, and the
+inputattach tool, which is part of the input-utils package. The following
+commands will tell the kernel that you have a TAOS EVM on the first
+serial port:
+
+# modprobe serport
+# inputattach --taos-evm /dev/ttyS0
+
+
+Technical details
+-----------------
+
+Only 4 SMBus transaction types are supported by the TAOS evaluation
+modules:
+* Receive Byte
+* Send Byte
+* Read Byte
+* Write Byte
+
+The communication protocol is text-based and pretty simple. It is
+described in a PDF document on the CD which comes with the evaluation
+module. The communication is rather slow, because the serial port has
+to operate at 1200 bps. However, I don't think this is a big concern in
+practice, as these modules are meant for evaluation and testing only.
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 165c1266fff7..c093b0700158 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -513,6 +513,22 @@ config I2C_SIS96X
 	  This driver can also be built as a module.  If so, the module
 	  will be called i2c-sis96x.
 
+config I2C_TAOS_EVM
+	tristate "TAOS evaluation module"
+	depends on EXPERIMENTAL
+	select SERIO
+	select SERIO_SERPORT
+	default n
+	help
+	  This supports TAOS evaluation modules on serial port. In order to
+	  use this driver, you will need the inputattach tool, which is part
+	  of the input-utils package.
+
+	  If unsure, say N.
+
+	  This support is also available as a module.  If so, the module
+	  will be called i2c-taos-evm.
+
 config I2C_STUB
 	tristate "I2C/SMBus Test Stub"
 	depends on EXPERIMENTAL && m
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index 14d1432f698b..a49c0a386aee 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_I2C_SIS5595)	+= i2c-sis5595.o
 obj-$(CONFIG_I2C_SIS630)	+= i2c-sis630.o
 obj-$(CONFIG_I2C_SIS96X)	+= i2c-sis96x.o
 obj-$(CONFIG_I2C_STUB)		+= i2c-stub.o
+obj-$(CONFIG_I2C_TAOS_EVM)	+= i2c-taos-evm.o
 obj-$(CONFIG_I2C_TINY_USB)	+= i2c-tiny-usb.o
 obj-$(CONFIG_I2C_VERSATILE)	+= i2c-versatile.o
 obj-$(CONFIG_I2C_ACORN)		+= i2c-acorn.o
diff --git a/drivers/i2c/busses/i2c-taos-evm.c b/drivers/i2c/busses/i2c-taos-evm.c
new file mode 100644
index 000000000000..1b0cfd5472fd
--- /dev/null
+++ b/drivers/i2c/busses/i2c-taos-evm.c
@@ -0,0 +1,330 @@
+/*
+ * Driver for the TAOS evaluation modules
+ * These devices include an I2C master which can be controlled over the
+ * serial port.
+ *
+ * Copyright (C) 2007 Jean Delvare <khali@linux-fr.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/serio.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+
+#define TAOS_BUFFER_SIZE	63
+
+#define TAOS_STATE_INIT		0
+#define TAOS_STATE_IDLE		1
+#define TAOS_STATE_SEND		2
+#define TAOS_STATE_RECV		3
+
+#define TAOS_CMD_RESET		0x12
+
+static DECLARE_WAIT_QUEUE_HEAD(wq);
+
+struct taos_data {
+	struct i2c_adapter adapter;
+	struct i2c_client *client;
+	int state;
+	u8 addr;		/* last used address */
+	unsigned char buffer[TAOS_BUFFER_SIZE];
+	unsigned int pos;	/* position inside the buffer */
+};
+
+/* TAOS TSL2550 EVM */
+static struct i2c_board_info tsl2550_info = {
+	I2C_BOARD_INFO("tsl2550", 0x39),
+	.type	= "tsl2550",
+};
+
+/* Instantiate i2c devices based on the adapter name */
+static struct i2c_client *taos_instantiate_device(struct i2c_adapter *adapter)
+{
+	if (!strncmp(adapter->name, "TAOS TSL2550 EVM", 16)) {
+		dev_info(&adapter->dev, "Instantiating device %s at 0x%02x\n",
+			tsl2550_info.driver_name, tsl2550_info.addr);
+		return i2c_new_device(adapter, &tsl2550_info);
+	}
+
+	return NULL;
+}
+
+static int taos_smbus_xfer(struct i2c_adapter *adapter, u16 addr,
+			   unsigned short flags, char read_write, u8 command,
+			   int size, union i2c_smbus_data *data)
+{
+	struct serio *serio = adapter->algo_data;
+	struct taos_data *taos = serio_get_drvdata(serio);
+	char *p;
+
+	/* Encode our transaction. "@" is for the device address, "$" for the
+	   SMBus command and "#" for the data. */
+	p = taos->buffer;
+
+	/* The device remembers the last used address, no need to send it
+	   again if it's the same */
+	if (addr != taos->addr)
+		p += sprintf(p, "@%02X", addr);
+
+	switch (size) {
+	case I2C_SMBUS_BYTE:
+		if (read_write == I2C_SMBUS_WRITE)
+			sprintf(p, "$#%02X", command);
+		else
+			sprintf(p, "$");
+		break;
+	case I2C_SMBUS_BYTE_DATA:
+		if (read_write == I2C_SMBUS_WRITE)
+			sprintf(p, "$%02X#%02X", command, data->byte);
+		else
+			sprintf(p, "$%02X", command);
+		break;
+	default:
+		dev_dbg(&adapter->dev, "Unsupported transaction size %d\n",
+			size);
+		return -EINVAL;
+	}
+
+	/* Send the transaction to the TAOS EVM */
+	dev_dbg(&adapter->dev, "Command buffer: %s\n", taos->buffer);
+	taos->pos = 0;
+	taos->state = TAOS_STATE_SEND;
+	serio_write(serio, taos->buffer[0]);
+	wait_event_interruptible_timeout(wq, taos->state == TAOS_STATE_IDLE,
+					 msecs_to_jiffies(250));
+	if (taos->state != TAOS_STATE_IDLE) {
+		dev_err(&adapter->dev, "Transaction failed "
+			"(state=%d, pos=%d)\n", taos->state, taos->pos);
+		taos->addr = 0;
+		return -EIO;
+	}
+	taos->addr = addr;
+
+	/* Start the transaction and read the answer */
+	taos->pos = 0;
+	taos->state = TAOS_STATE_RECV;
+	serio_write(serio, read_write == I2C_SMBUS_WRITE ? '>' : '<');
+	wait_event_interruptible_timeout(wq, taos->state == TAOS_STATE_IDLE,
+					 msecs_to_jiffies(150));
+	if (taos->state != TAOS_STATE_IDLE
+	 || taos->pos != 6) {
+		dev_err(&adapter->dev, "Transaction timeout (pos=%d)\n",
+			taos->pos);
+		return -EIO;
+	}
+	dev_dbg(&adapter->dev, "Answer buffer: %s\n", taos->buffer);
+
+	/* Interpret the returned string */
+	p = taos->buffer + 2;
+	p[3] = '\0';
+	if (!strcmp(p, "NAK"))
+		return -ENODEV;
+
+	if (read_write == I2C_SMBUS_WRITE) {
+		if (!strcmp(p, "ACK"))
+			return 0;
+	} else {
+		if (p[0] == 'x') {
+			data->byte = simple_strtol(p + 1, NULL, 16);
+			return 0;
+		}
+	}
+
+	return -EIO;
+}
+
+static u32 taos_smbus_func(struct i2c_adapter *adapter)
+{
+	return I2C_FUNC_SMBUS_BYTE | I2C_FUNC_SMBUS_BYTE_DATA;
+}
+
+static const struct i2c_algorithm taos_algorithm = {
+	.smbus_xfer	= taos_smbus_xfer,
+	.functionality	= taos_smbus_func,
+};
+
+static irqreturn_t taos_interrupt(struct serio *serio, unsigned char data,
+				  unsigned int flags)
+{
+	struct taos_data *taos = serio_get_drvdata(serio);
+
+	switch (taos->state) {
+	case TAOS_STATE_INIT:
+		taos->buffer[taos->pos++] = data;
+		if (data == ':'
+		 || taos->pos == TAOS_BUFFER_SIZE - 1) {
+			taos->buffer[taos->pos] = '\0';
+			taos->state = TAOS_STATE_IDLE;
+			wake_up_interruptible(&wq);
+		}
+		break;
+	case TAOS_STATE_SEND:
+		if (taos->buffer[++taos->pos])
+			serio_write(serio, taos->buffer[taos->pos]);
+		else {
+			taos->state = TAOS_STATE_IDLE;
+			wake_up_interruptible(&wq);
+		}
+		break;
+	case TAOS_STATE_RECV:
+		taos->buffer[taos->pos++] = data;
+		if (data == ']') {
+			taos->buffer[taos->pos] = '\0';
+			taos->state = TAOS_STATE_IDLE;
+			wake_up_interruptible(&wq);
+		}
+		break;
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* Extract the adapter name from the buffer received after reset.
+   The buffer is modified and a pointer inside the buffer is returned. */
+static char *taos_adapter_name(char *buffer)
+{
+	char *start, *end;
+
+	start = strstr(buffer, "TAOS ");
+	if (!start)
+		return NULL;
+
+	end = strchr(start, '\r');
+	if (!end)
+		return NULL;
+	*end = '\0';
+
+	return start;
+}
+
+static int taos_connect(struct serio *serio, struct serio_driver *drv)
+{
+	struct taos_data *taos;
+	struct i2c_adapter *adapter;
+	char *name;
+	int err;
+
+	taos = kzalloc(sizeof(struct taos_data), GFP_KERNEL);
+	if (!taos) {
+		err = -ENOMEM;
+		goto exit;
+	}
+	taos->state = TAOS_STATE_INIT;
+	serio_set_drvdata(serio, taos);
+
+	err = serio_open(serio, drv);
+	if (err)
+		goto exit_kfree;
+
+	adapter = &taos->adapter;
+	adapter->owner = THIS_MODULE;
+	adapter->algo = &taos_algorithm;
+	adapter->algo_data = serio;
+	adapter->dev.parent = &serio->dev;
+
+	/* Reset the TAOS evaluation module to identify it */
+	serio_write(serio, TAOS_CMD_RESET);
+	wait_event_interruptible_timeout(wq, taos->state == TAOS_STATE_IDLE,
+					 msecs_to_jiffies(2000));
+
+	if (taos->state != TAOS_STATE_IDLE) {
+		err = -ENODEV;
+		dev_dbg(&serio->dev, "TAOS EVM reset failed (state=%d, "
+			"pos=%d)\n", taos->state, taos->pos);
+		goto exit_close;
+	}
+
+	name = taos_adapter_name(taos->buffer);
+	if (!name) {
+		err = -ENODEV;
+		dev_err(&serio->dev, "TAOS EVM identification failed\n");
+		goto exit_close;
+	}
+	strlcpy(adapter->name, name, sizeof(adapter->name));
+
+	err = i2c_add_adapter(adapter);
+	if (err)
+		goto exit_close;
+	dev_dbg(&serio->dev, "Connected to TAOS EVM\n");
+
+	taos->client = taos_instantiate_device(adapter);
+	return 0;
+
+ exit_close:
+	serio_close(serio);
+ exit_kfree:
+	serio_set_drvdata(serio, NULL);
+	kfree(taos);
+ exit:
+	return err;
+}
+
+static void taos_disconnect(struct serio *serio)
+{
+	struct taos_data *taos = serio_get_drvdata(serio);
+
+	if (taos->client)
+		i2c_unregister_device(taos->client);
+	i2c_del_adapter(&taos->adapter);
+	serio_close(serio);
+	serio_set_drvdata(serio, NULL);
+	kfree(taos);
+
+	dev_dbg(&serio->dev, "Disconnected from TAOS EVM\n");
+}
+
+static struct serio_device_id taos_serio_ids[] = {
+	{
+		.type	= SERIO_RS232,
+		.proto	= SERIO_TAOSEVM,
+		.id	= SERIO_ANY,
+		.extra	= SERIO_ANY,
+	},
+	{ 0 }
+};
+MODULE_DEVICE_TABLE(serio, taos_serio_ids);
+
+static struct serio_driver taos_drv = {
+	.driver		= {
+		.name	= "taos-evm",
+	},
+	.description	= "TAOS evaluation module driver",
+	.id_table	= taos_serio_ids,
+	.connect	= taos_connect,
+	.disconnect	= taos_disconnect,
+	.interrupt	= taos_interrupt,
+};
+
+static int __init taos_init(void)
+{
+	return serio_register_driver(&taos_drv);
+}
+
+static void __exit taos_exit(void)
+{
+	serio_unregister_driver(&taos_drv);
+}
+
+MODULE_AUTHOR("Jean Delvare <khali@linux-fr.org>");
+MODULE_DESCRIPTION("TAOS evaluation module driver");
+MODULE_LICENSE("GPL");
+
+module_init(taos_init);
+module_exit(taos_exit);
diff --git a/include/linux/serio.h b/include/linux/serio.h
index 1ebf0455e224..d9377ce9ffd1 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -209,5 +209,6 @@ static inline void serio_unpin_driver(struct serio *serio)
 #define SERIO_PENMOUNT	0x31
 #define SERIO_TOUCHRIGHT	0x32
 #define SERIO_TOUCHWIN	0x33
+#define SERIO_TAOSEVM	0x34
 
 #endif
-- 
cgit v1.3-8-gc7d7


From e087db510cd96a75a614f6f6fcd5499ab21cb087 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 11 Jul 2007 12:18:31 -0700
Subject: Clean up struct screen_info (<linux/screen_info.h>)

struct screen_info has unaligned members, it needs to be packed.
In the process, fix the naming of some of the members, which don't
belong in this structure but are part of it anyway.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/screen_info.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
index b02308ee7667..3ee412bc00ec 100644
--- a/include/linux/screen_info.h
+++ b/include/linux/screen_info.h
@@ -10,7 +10,7 @@
 struct screen_info {
 	u8  orig_x;		/* 0x00 */
 	u8  orig_y;		/* 0x01 */
-	u16 dontuse1;		/* 0x02 -- EXT_MEM_K sits here */
+	u16 ext_mem_k;		/* 0x02 */
 	u16 orig_video_page;	/* 0x04 */
 	u8  orig_video_mode;	/* 0x06 */
 	u8  orig_video_cols;	/* 0x07 */
@@ -27,7 +27,7 @@ struct screen_info {
 	u16 lfb_depth;		/* 0x16 */
 	u32 lfb_base;		/* 0x18 */
 	u32 lfb_size;		/* 0x1c */
-	u16 dontuse2, dontuse3;	/* 0x20 -- CL_MAGIC and CL_OFFSET here */
+	u16 cl_magic, cl_offset; /* 0x20 */
 	u16 lfb_linelength;	/* 0x24 */
 	u8  red_size;		/* 0x26 */
 	u8  red_pos;		/* 0x27 */
@@ -42,9 +42,8 @@ struct screen_info {
 	u16 pages;		/* 0x32 */
 	u16 vesa_attributes;	/* 0x34 */
 	u32 capabilities;       /* 0x36 */
-				/* 0x3a -- 0x3b reserved for future expansion */
-				/* 0x3c -- 0x3f micro stack for relocatable kernels */
-};
+	u8  _reserved[6];	/* 0x3a */
+} __attribute__((packed));
 
 extern struct screen_info screen_info;
 
-- 
cgit v1.3-8-gc7d7


From c39736823232bc3ca113c8228fa852c09fba300e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 11 Jul 2007 12:18:58 -0700
Subject: Remove old i386 setup code

This removes the old i386 setup code.  This is done as a separate patch
to avoid breaking git bisect as some of the i386 code was also used by
the old x86-64 code.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/boot/bootsect.S     |   98 --
 arch/i386/boot/edd.S          |  231 -----
 arch/i386/boot/setup.S        | 1075 ----------------------
 arch/i386/boot/video.S        | 2043 -----------------------------------------
 arch/i386/kernel/verify_cpu.S |   94 --
 include/linux/edd.h           |    4 -
 6 files changed, 3545 deletions(-)
 delete mode 100644 arch/i386/boot/bootsect.S
 delete mode 100644 arch/i386/boot/edd.S
 delete mode 100644 arch/i386/boot/setup.S
 delete mode 100644 arch/i386/boot/video.S
 delete mode 100644 arch/i386/kernel/verify_cpu.S

(limited to 'include/linux')

diff --git a/arch/i386/boot/bootsect.S b/arch/i386/boot/bootsect.S
deleted file mode 100644
index 011b7a4993d4..000000000000
--- a/arch/i386/boot/bootsect.S
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- *	bootsect.S		Copyright (C) 1991, 1992 Linus Torvalds
- *
- *	modified by Drew Eckhardt
- *	modified by Bruce Evans (bde)
- *	modified by Chris Noe (May 1999) (as86 -> gas)
- *	gutted by H. Peter Anvin (Jan 2003)
- *
- * BIG FAT NOTE: We're in real mode using 64k segments.  Therefore segment
- * addresses must be multiplied by 16 to obtain their respective linear
- * addresses. To avoid confusion, linear addresses are written using leading
- * hex while segment addresses are written as segment:offset.
- *
- */
-
-#include <asm/boot.h>
-
-SETUPSECTS	= 4			/* default nr of setup-sectors */
-BOOTSEG		= 0x07C0		/* original address of boot-sector */
-INITSEG		= DEF_INITSEG		/* we move boot here - out of the way */
-SETUPSEG	= DEF_SETUPSEG		/* setup starts here */
-SYSSEG		= DEF_SYSSEG		/* system loaded at 0x10000 (65536) */
-SYSSIZE		= DEF_SYSSIZE		/* system size: # of 16-byte clicks */
-					/* to be loaded */
-ROOT_DEV	= 0 			/* ROOT_DEV is now written by "build" */
-SWAP_DEV	= 0			/* SWAP_DEV is now written by "build" */
-
-#ifndef SVGA_MODE
-#define SVGA_MODE ASK_VGA
-#endif
-
-#ifndef RAMDISK
-#define RAMDISK 0
-#endif
-
-#ifndef ROOT_RDONLY
-#define ROOT_RDONLY 1
-#endif
-
-.code16
-.text
-
-.global _start
-_start:
-
-	# Normalize the start address
-	jmpl	$BOOTSEG, $start2
-
-start2:
-	movw	%cs, %ax
-	movw	%ax, %ds
-	movw	%ax, %es
-	movw	%ax, %ss
-	movw	$0x7c00, %sp
-	sti
-	cld
-
-	movw	$bugger_off_msg, %si
-
-msg_loop:
-	lodsb
-	andb	%al, %al
-	jz	die
-	movb	$0xe, %ah
-	movw	$7, %bx
-	int	$0x10
-	jmp	msg_loop
-
-die:
-	# Allow the user to press a key, then reboot
-	xorw	%ax, %ax
-	int	$0x16
-	int	$0x19
-
-	# int 0x19 should never return.  In case it does anyway,
-	# invoke the BIOS reset code...
-	ljmp	$0xf000,$0xfff0
-
-
-bugger_off_msg:
-	.ascii	"Direct booting from floppy is no longer supported.\r\n"
-	.ascii	"Please use a boot loader program instead.\r\n"
-	.ascii	"\n"
-	.ascii	"Remove disk and press any key to reboot . . .\r\n"
-	.byte	0
-
-
-	# Kernel attributes; used by setup
-
-	.org 497
-setup_sects:	.byte SETUPSECTS
-root_flags:	.word ROOT_RDONLY
-syssize:	.word SYSSIZE
-swap_dev:	.word SWAP_DEV
-ram_size:	.word RAMDISK
-vid_mode:	.word SVGA_MODE
-root_dev:	.word ROOT_DEV
-boot_flag:	.word 0xAA55
diff --git a/arch/i386/boot/edd.S b/arch/i386/boot/edd.S
deleted file mode 100644
index 34321368011a..000000000000
--- a/arch/i386/boot/edd.S
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * BIOS Enhanced Disk Drive support
- * Copyright (C) 2002, 2003, 2004 Dell, Inc.
- * by Matt Domsch <Matt_Domsch@dell.com> October 2002
- * conformant to T13 Committee www.t13.org
- *   projects 1572D, 1484D, 1386D, 1226DT
- * disk signature read by Matt Domsch <Matt_Domsch@dell.com>
- *	and Andrew Wilks <Andrew_Wilks@dell.com> September 2003, June 2004
- * legacy CHS retrieval by Patrick J. LoPresti <patl@users.sourceforge.net>
- *      March 2004
- * Command line option parsing, Matt Domsch, November 2004
- */
-
-#include <linux/edd.h>
-#include <asm/setup.h>
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-
-# It is assumed that %ds == INITSEG here
-
-	movb	$0, (EDD_MBR_SIG_NR_BUF)
-	movb	$0, (EDDNR)
-
-# Check the command line for options:
-# edd=of  disables EDD completely  (edd=off)
-# edd=sk  skips the MBR test    (edd=skipmbr)
-# edd=on  re-enables EDD (edd=on)
-
-	pushl	%esi
-	movw	$edd_mbr_sig_start, %di	# Default to edd=on
-
-	movl	%cs:(cmd_line_ptr), %esi
-	andl	%esi, %esi
-	jz	old_cl			# Old boot protocol?
-
-# Convert to a real-mode pointer in fs:si
-	movl	%esi, %eax
-	shrl	$4, %eax
-	movw	%ax, %fs
-	andw	$0xf, %si
-	jmp	have_cl_pointer
-
-# Old-style boot protocol?
-old_cl:
-	push	%ds			# aka INITSEG
-	pop	%fs
-
-	cmpw	$0xa33f, (0x20)
-	jne	done_cl			# No command line at all?
-	movw	(0x22), %si		# Pointer relative to INITSEG
-
-# fs:si has the pointer to the command line now
-have_cl_pointer:
-
-# Loop through kernel command line one byte at a time.  Just in
-# case the loader is buggy and failed to null-terminate the command line
-# terminate if we get close enough to the end of the segment that we
-# cannot fit "edd=XX"...
-cl_atspace:
-	cmpw	$-5, %si		# Watch for segment wraparound
-	jae	done_cl
-	movl	%fs:(%si), %eax
-	andb	%al, %al		# End of line?
-	jz	done_cl
-	cmpl	$EDD_CL_EQUALS, %eax
-	jz	found_edd_equals
-	cmpb	$0x20, %al		# <= space consider whitespace
-	ja	cl_skipword
-	incw	%si
-	jmp	cl_atspace
-
-cl_skipword:
-	cmpw	$-5, %si		# Watch for segment wraparound
-	jae	done_cl
-	movb	%fs:(%si), %al		# End of string?
-	andb	%al, %al
-	jz	done_cl
-	cmpb	$0x20, %al
-	jbe	cl_atspace
-	incw	%si
-	jmp	cl_skipword
-
-found_edd_equals:
-# only looking at first two characters after equals
-# late overrides early on the command line, so keep going after finding something
-	movw	%fs:4(%si), %ax
-	cmpw	$EDD_CL_OFF, %ax	# edd=of
-	je	do_edd_off
-	cmpw	$EDD_CL_SKIP, %ax	# edd=sk
-	je	do_edd_skipmbr
-	cmpw	$EDD_CL_ON, %ax		# edd=on
-	je	do_edd_on
-	jmp	cl_skipword
-do_edd_skipmbr:
-	movw	$edd_start, %di
-	jmp	cl_skipword
-do_edd_off:
-	movw	$edd_done, %di
-	jmp	cl_skipword
-do_edd_on:
-	movw	$edd_mbr_sig_start, %di
-	jmp	cl_skipword
-
-done_cl:
-	popl	%esi
-	jmpw	*%di
-
-# Read the first sector of each BIOS disk device and store the 4-byte signature
-edd_mbr_sig_start:
-	movb	$0x80, %dl			# from device 80
-	movw	$EDD_MBR_SIG_BUF, %bx		# store buffer ptr in bx
-edd_mbr_sig_read:
-	movl	$0xFFFFFFFF, %eax
-	movl	%eax, (%bx)			# assume failure
-	pushw	%bx
-	movb	$READ_SECTORS, %ah
-	movb	$1, %al				# read 1 sector
-	movb	$0, %dh				# at head 0
-	movw	$1, %cx				# cylinder 0, sector 0
-	pushw	%es
-	pushw	%ds
-	popw	%es
-    	movw	$EDDBUF, %bx			# disk's data goes into EDDBUF
-	pushw	%dx             # work around buggy BIOSes
-	stc                     # work around buggy BIOSes
-	int	$0x13
-	sti                     # work around buggy BIOSes
-	popw	%dx
-	popw	%es
-	popw	%bx
-	jc	edd_mbr_sig_done		# on failure, we're done.
-	cmpb	$0, %ah		# some BIOSes do not set CF
-	jne	edd_mbr_sig_done		# on failure, we're done.
-	movl	(EDDBUF+EDD_MBR_SIG_OFFSET), %eax # read sig out of the MBR
-	movl	%eax, (%bx)			# store success
-	incb	(EDD_MBR_SIG_NR_BUF)		# note that we stored something
-	incb	%dl				# increment to next device
-	addw	$4, %bx				# increment sig buffer ptr
-	cmpb	$EDD_MBR_SIG_MAX, (EDD_MBR_SIG_NR_BUF)	# Out of space?
-	jb	edd_mbr_sig_read		# keep looping
-edd_mbr_sig_done:
-
-# Do the BIOS Enhanced Disk Drive calls
-# This consists of two calls:
-#    int 13h ah=41h "Check Extensions Present"
-#    int 13h ah=48h "Get Device Parameters"
-#    int 13h ah=08h "Legacy Get Device Parameters"
-#
-# A buffer of size EDDMAXNR*(EDDEXTSIZE+EDDPARMSIZE) is reserved for our use
-# in the boot_params at EDDBUF.  The first four bytes of which are
-# used to store the device number, interface support map and version
-# results from fn41.  The next four bytes are used to store the legacy
-# cylinders, heads, and sectors from fn08. The following 74 bytes are used to
-# store the results from fn48.  Starting from device 80h, fn41, then fn48
-# are called and their results stored in EDDBUF+n*(EDDEXTSIZE+EDDPARMIZE).
-# Then the pointer is incremented to store the data for the next call.
-# This repeats until either a device doesn't exist, or until EDDMAXNR
-# devices have been stored.
-# The one tricky part is that ds:si always points EDDEXTSIZE bytes into
-# the structure, and the fn41 and fn08 results are stored at offsets
-# from there.  This removes the need to increment the pointer for
-# every store, and leaves it ready for the fn48 call.
-# A second one-byte buffer, EDDNR, in the boot_params stores
-# the number of BIOS devices which exist, up to EDDMAXNR.
-# In setup.c, copy_edd() stores both boot_params buffers away
-# for later use, as they would get overwritten otherwise.
-# This code is sensitive to the size of the structs in edd.h
-edd_start:
-						# %ds points to the bootsector
-       						# result buffer for fn48
-	movw	$EDDBUF+EDDEXTSIZE, %si		# in ds:si, fn41 results
-						# kept just before that
-	movb	$0x80, %dl			# BIOS device 0x80
-
-edd_check_ext:
-	movb	$CHECKEXTENSIONSPRESENT, %ah    # Function 41
-	movw	$EDDMAGIC1, %bx			# magic
-	int	$0x13				# make the call
-	jc	edd_done			# no more BIOS devices
-
-	cmpw	$EDDMAGIC2, %bx			# is magic right?
-	jne	edd_next			# nope, next...
-
-	movb	%dl, %ds:-8(%si)		# store device number
-	movb	%ah, %ds:-7(%si)		# store version
-	movw	%cx, %ds:-6(%si)		# store extensions
-	incb	(EDDNR)				# note that we stored something
-
-edd_get_device_params:
-	movw	$EDDPARMSIZE, %ds:(%si)		# put size
-	movw	$0x0, %ds:2(%si)		# work around buggy BIOSes
-	movb	$GETDEVICEPARAMETERS, %ah	# Function 48
-	int	$0x13				# make the call
-						# Don't check for fail return
-						# it doesn't matter.
-edd_get_legacy_chs:
-	xorw    %ax, %ax
-	movw    %ax, %ds:-4(%si)
-	movw    %ax, %ds:-2(%si)
-        # Ralf Brown's Interrupt List says to set ES:DI to
-	# 0000h:0000h "to guard against BIOS bugs"
-	pushw   %es
-	movw    %ax, %es
-	movw    %ax, %di
-	pushw   %dx                             # legacy call clobbers %dl
-	movb    $LEGACYGETDEVICEPARAMETERS, %ah # Function 08
-	int     $0x13                           # make the call
-	jc      edd_legacy_done                 # failed
-	movb    %cl, %al                        # Low 6 bits are max
-	andb    $0x3F, %al                      #   sector number
-	movb	%al, %ds:-1(%si)                # Record max sect
-	movb    %dh, %ds:-2(%si)                # Record max head number
-	movb    %ch, %al                        # Low 8 bits of max cyl
-	shr     $6, %cl
-	movb    %cl, %ah                        # High 2 bits of max cyl
-	movw    %ax, %ds:-4(%si)
-
-edd_legacy_done:
-	popw    %dx
-	popw    %es
-	movw	%si, %ax			# increment si
-	addw	$EDDPARMSIZE+EDDEXTSIZE, %ax
-	movw	%ax, %si
-
-edd_next:
-	incb	%dl				# increment to next device
-	cmpb	$EDDMAXNR, (EDDNR) 		# Out of space?
-	jb	edd_check_ext			# keep looping
-
-edd_done:
-#endif
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
deleted file mode 100644
index 6dbcc95b2120..000000000000
--- a/arch/i386/boot/setup.S
+++ /dev/null
@@ -1,1075 +0,0 @@
-/*
- *	setup.S		Copyright (C) 1991, 1992 Linus Torvalds
- *
- * setup.s is responsible for getting the system data from the BIOS,
- * and putting them into the appropriate places in system memory.
- * both setup.s and system has been loaded by the bootblock.
- *
- * This code asks the bios for memory/disk/other parameters, and
- * puts them in a "safe" place: 0x90000-0x901FF, ie where the
- * boot-block used to be. It is then up to the protected mode
- * system to read them from there before the area is overwritten
- * for buffer-blocks.
- *
- * Move PS/2 aux init code to psaux.c
- * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92
- *
- * some changes and additional features by Christoph Niemann,
- * March 1993/June 1994 (Christoph.Niemann@linux.org)
- *
- * add APM BIOS checking by Stephen Rothwell, May 1994
- * (sfr@canb.auug.org.au)
- *
- * High load stuff, initrd support and position independency
- * by Hans Lermen & Werner Almesberger, February 1996
- * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch>
- *
- * Video handling moved to video.S by Martin Mares, March 1996
- * <mj@k332.feld.cvut.cz>
- *
- * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david
- * parsons) to avoid loadlin confusion, July 1997
- *
- * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
- * <stiker@northlink.com>
- *
- * Fix to work around buggy BIOSes which don't use carry bit correctly
- * and/or report extended memory in CX/DX for e801h memory size detection 
- * call.  As a result the kernel got wrong figures.  The int15/e801h docs
- * from Ralf Brown interrupt list seem to indicate AX/BX should be used
- * anyway.  So to avoid breaking many machines (presumably there was a reason
- * to orginally use CX/DX instead of AX/BX), we do a kludge to see
- * if CX/DX have been changed in the e801 call and if so use AX/BX .
- * Michael Miller, April 2001 <michaelm@mjmm.org>
- *
- * New A20 code ported from SYSLINUX by H. Peter Anvin. AMD Elan bugfixes
- * by Robert Schwebel, December 2001 <robert@schwebel.de>
- */
-
-#include <asm/segment.h>
-#include <linux/utsrelease.h>
-#include <linux/compile.h>
-#include <asm/boot.h>
-#include <asm/e820.h>
-#include <asm/page.h>
-#include <asm/setup.h>
-	
-/* Signature words to ensure LILO loaded us right */
-#define SIG1	0xAA55
-#define SIG2	0x5A5A
-
-INITSEG  = DEF_INITSEG		# 0x9000, we move boot here, out of the way
-SYSSEG   = DEF_SYSSEG		# 0x1000, system loaded at 0x10000 (65536).
-SETUPSEG = DEF_SETUPSEG		# 0x9020, this is the current segment
-				# ... and the former contents of CS
-
-DELTA_INITSEG = SETUPSEG - INITSEG	# 0x0020
-
-.code16
-.globl begtext, begdata, begbss, endtext, enddata, endbss
-
-.text
-begtext:
-.data
-begdata:
-.bss
-begbss:
-.text
-
-start:
-	jmp	trampoline
-
-# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
-
-		.ascii	"HdrS"		# header signature
-		.word	0x0206		# header version number (>= 0x0105)
-					# or else old loadlin-1.5 will fail)
-realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
-start_sys_seg:	.word	SYSSEG
-		.word	kernel_version	# pointing to kernel version string
-					# above section of header is compatible
-					# with loadlin-1.5 (header v1.5). Don't
-					# change it.
-
-type_of_loader:	.byte	0		# = 0, old one (LILO, Loadlin,
-					#      Bootlin, SYSLX, bootsect...)
-					# See Documentation/i386/boot.txt for
-					# assigned ids
-	
-# flags, unused bits must be zero (RFU) bit within loadflags
-loadflags:
-LOADED_HIGH	= 1			# If set, the kernel is loaded high
-CAN_USE_HEAP	= 0x80			# If set, the loader also has set
-					# heap_end_ptr to tell how much
-					# space behind setup.S can be used for
-					# heap purposes.
-					# Only the loader knows what is free
-#ifndef __BIG_KERNEL__
-		.byte	0
-#else
-		.byte	LOADED_HIGH
-#endif
-
-setup_move_size: .word  0x8000		# size to move, when setup is not
-					# loaded at 0x90000. We will move setup 
-					# to 0x90000 then just before jumping
-					# into the kernel. However, only the
-					# loader knows how much data behind
-					# us also needs to be loaded.
-
-code32_start:				# here loaders can put a different
-					# start address for 32-bit code.
-#ifndef __BIG_KERNEL__
-		.long	0x1000		#   0x1000 = default for zImage
-#else
-		.long	0x100000	# 0x100000 = default for big kernel
-#endif
-
-ramdisk_image:	.long	0		# address of loaded ramdisk image
-					# Here the loader puts the 32-bit
-					# address where it loaded the image.
-					# This only will be read by the kernel.
-
-ramdisk_size:	.long	0		# its size in bytes
-
-bootsect_kludge:
-		.long	0		# obsolete
-
-heap_end_ptr:	.word	modelist+1024	# (Header version 0x0201 or later)
-					# space from here (exclusive) down to
-					# end of setup code can be used by setup
-					# for local heap purposes.
-
-pad1:		.word	0
-cmd_line_ptr:	.long 0			# (Header version 0x0202 or later)
-					# If nonzero, a 32-bit pointer
-					# to the kernel command line.
-					# The command line should be
-					# located between the start of
-					# setup and the end of low
-					# memory (0xa0000), or it may
-					# get overwritten before it
-					# gets read.  If this field is
-					# used, there is no longer
-					# anything magical about the
-					# 0x90000 segment; the setup
-					# can be located anywhere in
-					# low memory 0x10000 or higher.
-
-ramdisk_max:	.long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
-					# (Header version 0x0203 or later)
-					# The highest safe address for
-					# the contents of an initrd
-
-kernel_alignment:  .long CONFIG_PHYSICAL_ALIGN 	#physical addr alignment
-						#required for protected mode
-						#kernel
-#ifdef CONFIG_RELOCATABLE
-relocatable_kernel:    .byte 1
-#else
-relocatable_kernel:    .byte 0
-#endif
-pad2:			.byte 0
-pad3:			.word 0
-
-cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
-                                                #added with boot protocol
-                                                #version 2.06
-
-trampoline:	call	start_of_setup
-		.align 16
-					# The offset at this point is 0x240
-		.space	(0xeff-0x240+1) # E820 & EDD space (ending at 0xeff)
-# End of setup header #####################################################
-
-start_of_setup:
-# Bootlin depends on this being done early
-	movw	$0x01500, %ax
-	movb	$0x81, %dl
-	int	$0x13
-
-#ifdef SAFE_RESET_DISK_CONTROLLER
-# Reset the disk controller.
-	movw	$0x0000, %ax
-	movb	$0x80, %dl
-	int	$0x13
-#endif
-
-# Set %ds = %cs, we know that SETUPSEG = %cs at this point
-	movw	%cs, %ax		# aka SETUPSEG
-	movw	%ax, %ds
-# Check signature at end of setup
-	cmpw	$SIG1, setup_sig1
-	jne	bad_sig
-
-	cmpw	$SIG2, setup_sig2
-	jne	bad_sig
-
-	jmp	good_sig1
-
-# Routine to print asciiz string at ds:si
-prtstr:
-	lodsb
-	andb	%al, %al
-	jz	fin
-
-	call	prtchr
-	jmp	prtstr
-
-fin:	ret
-
-# Space printing
-prtsp2:	call	prtspc		# Print double space
-prtspc:	movb	$0x20, %al	# Print single space (note: fall-thru)
-
-# Part of above routine, this one just prints ascii al
-prtchr:	pushw	%ax
-	pushw	%cx
-	movw	$7,%bx
-	movw	$0x01, %cx
-	movb	$0x0e, %ah
-	int	$0x10
-	popw	%cx
-	popw	%ax
-	ret
-
-beep:	movb	$0x07, %al
-	jmp	prtchr
-	
-no_sig_mess: .string	"No setup signature found ..."
-
-good_sig1:
-	jmp	good_sig
-
-# We now have to find the rest of the setup code/data
-bad_sig:
-	movw	%cs, %ax			# SETUPSEG
-	subw	$DELTA_INITSEG, %ax		# INITSEG
-	movw	%ax, %ds
-	xorb	%bh, %bh
-	movb	(497), %bl			# get setup sect from bootsect
-	subw	$4, %bx				# LILO loads 4 sectors of setup
-	shlw	$8, %bx				# convert to words (1sect=2^8 words)
-	movw	%bx, %cx
-	shrw	$3, %bx				# convert to segment
-	addw	$SYSSEG, %bx
-	movw	%bx, %cs:start_sys_seg
-# Move rest of setup code/data to here
-	movw	$2048, %di			# four sectors loaded by LILO
-	subw	%si, %si
-	pushw	%cs
-	popw	%es
-	movw	$SYSSEG, %ax
-	movw	%ax, %ds
-	rep
-	movsw
-	movw	%cs, %ax			# aka SETUPSEG
-	movw	%ax, %ds
-	cmpw	$SIG1, setup_sig1
-	jne	no_sig
-
-	cmpw	$SIG2, setup_sig2
-	jne	no_sig
-
-	jmp	good_sig
-
-no_sig:
-	lea	no_sig_mess, %si
-	call	prtstr
-
-no_sig_loop:
-	hlt
-	jmp	no_sig_loop
-
-good_sig:
-	movw	%cs, %ax			# aka SETUPSEG
-	subw	$DELTA_INITSEG, %ax 		# aka INITSEG
-	movw	%ax, %ds
-# Check if an old loader tries to load a big-kernel
-	testb	$LOADED_HIGH, %cs:loadflags	# Do we have a big kernel?
-	jz	loader_ok			# No, no danger for old loaders.
-
-	cmpb	$0, %cs:type_of_loader 		# Do we have a loader that
-						# can deal with us?
-	jnz	loader_ok			# Yes, continue.
-
-	pushw	%cs				# No, we have an old loader,
-	popw	%ds				# die. 
-	lea	loader_panic_mess, %si
-	call	prtstr
-
-	jmp	no_sig_loop
-
-loader_panic_mess: .string "Wrong loader, giving up..."
-
-# check minimum cpuid
-# we do this here because it is the last place we can actually
-# show a user visible error message. Later the video modus
-# might be already messed up.
-loader_ok:
-	call verify_cpu
-	testl  %eax,%eax
-	jz	cpu_ok
-	movw	%cs,%ax		# aka SETUPSEG
-	movw	%ax,%ds
-	lea	cpu_panic_mess,%si
-	call	prtstr
-1:	jmp	1b
-
-cpu_panic_mess:
-	.asciz  "PANIC: CPU too old for this kernel."
-
-#include "../kernel/verify_cpu.S"
-
-cpu_ok:
-# Get memory size (extended mem, kB)
-
-	xorl	%eax, %eax
-	movl	%eax, (0x1e0)
-#ifndef STANDARD_MEMORY_BIOS_CALL
-	movb	%al, (E820NR)
-# Try three different memory detection schemes.  First, try
-# e820h, which lets us assemble a memory map, then try e801h,
-# which returns a 32-bit memory size, and finally 88h, which
-# returns 0-64m
-
-# method E820H:
-# the memory map from hell.  e820h returns memory classified into
-# a whole bunch of different types, and allows memory holes and
-# everything.  We scan through this memory map and build a list
-# of the first 32 memory areas, which we return at [E820MAP].
-# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification.
-
-#define SMAP  0x534d4150
-
-meme820:
-	xorl	%ebx, %ebx			# continuation counter
-	movw	$E820MAP, %di			# point into the whitelist
-						# so we can have the bios
-						# directly write into it.
-
-jmpe820:
-	movl	$0x0000e820, %eax		# e820, upper word zeroed
-	movl	$SMAP, %edx			# ascii 'SMAP'
-	movl	$20, %ecx			# size of the e820rec
-	pushw	%ds				# data record.
-	popw	%es
-	int	$0x15				# make the call
-	jc	bail820				# fall to e801 if it fails
-
-	cmpl	$SMAP, %eax			# check the return is `SMAP'
-	jne	bail820				# fall to e801 if it fails
-
-#	cmpl	$1, 16(%di)			# is this usable memory?
-#	jne	again820
-
-	# If this is usable memory, we save it by simply advancing %di by
-	# sizeof(e820rec).
-	#
-good820:
-	movb	(E820NR), %al			# up to 128 entries
-	cmpb	$E820MAX, %al
-	jae	bail820
-
-	incb	(E820NR)
-	movw	%di, %ax
-	addw	$20, %ax
-	movw	%ax, %di
-again820:
-	cmpl	$0, %ebx			# check to see if
-	jne	jmpe820				# %ebx is set to EOF
-bail820:
-
-
-# method E801H:
-# memory size is in 1k chunksizes, to avoid confusing loadlin.
-# we store the 0xe801 memory size in a completely different place,
-# because it will most likely be longer than 16 bits.
-# (use 1e0 because that's what Larry Augustine uses in his
-# alternative new memory detection scheme, and it's sensible
-# to write everything into the same place.)
-
-meme801:
-	stc					# fix to work around buggy
-	xorw	%cx,%cx				# BIOSes which don't clear/set
-	xorw	%dx,%dx				# carry on pass/error of
-						# e801h memory size call
-						# or merely pass cx,dx though
-						# without changing them.
-	movw	$0xe801, %ax
-	int	$0x15
-	jc	mem88
-
-	cmpw	$0x0, %cx			# Kludge to handle BIOSes
-	jne	e801usecxdx			# which report their extended
-	cmpw	$0x0, %dx			# memory in AX/BX rather than
-	jne	e801usecxdx			# CX/DX.  The spec I have read
-	movw	%ax, %cx			# seems to indicate AX/BX 
-	movw	%bx, %dx			# are more reasonable anyway...
-
-e801usecxdx:
-	andl	$0xffff, %edx			# clear sign extend
-	shll	$6, %edx			# and go from 64k to 1k chunks
-	movl	%edx, (0x1e0)			# store extended memory size
-	andl	$0xffff, %ecx			# clear sign extend
- 	addl	%ecx, (0x1e0)			# and add lower memory into
-						# total size.
-
-# Ye Olde Traditional Methode.  Returns the memory size (up to 16mb or
-# 64mb, depending on the bios) in ax.
-mem88:
-
-#endif
-	movb	$0x88, %ah
-	int	$0x15
-	movw	%ax, (2)
-
-# Set the keyboard repeat rate to the max
-	movw	$0x0305, %ax
-	xorw	%bx, %bx
-	int	$0x16
-
-# Check for video adapter and its parameters and allow the
-# user to browse video modes.
-	call	video				# NOTE: we need %ds pointing
-						# to bootsector
-
-# Get hd0 data...
-	xorw	%ax, %ax
-	movw	%ax, %ds
-	ldsw	(4 * 0x41), %si
-	movw	%cs, %ax			# aka SETUPSEG
-	subw	$DELTA_INITSEG, %ax		# aka INITSEG
-	pushw	%ax
-	movw	%ax, %es
-	movw	$0x0080, %di
-	movw	$0x10, %cx
-	pushw	%cx
-	cld
-	rep
- 	movsb
-# Get hd1 data...
-	xorw	%ax, %ax
-	movw	%ax, %ds
-	ldsw	(4 * 0x46), %si
-	popw	%cx
-	popw	%es
-	movw	$0x0090, %di
-	rep
-	movsb
-# Check that there IS a hd1 :-)
-	movw	$0x01500, %ax
-	movb	$0x81, %dl
-	int	$0x13
-	jc	no_disk1
-	
-	cmpb	$3, %ah
-	je	is_disk1
-
-no_disk1:
-	movw	%cs, %ax			# aka SETUPSEG
-	subw	$DELTA_INITSEG, %ax 		# aka INITSEG
-	movw	%ax, %es
-	movw	$0x0090, %di
-	movw	$0x10, %cx
-	xorw	%ax, %ax
-	cld
-	rep
-	stosb
-is_disk1:
-# check for Micro Channel (MCA) bus
-	movw	%cs, %ax			# aka SETUPSEG
-	subw	$DELTA_INITSEG, %ax		# aka INITSEG
-	movw	%ax, %ds
-	xorw	%ax, %ax
-	movw	%ax, (0xa0)			# set table length to 0
-	movb	$0xc0, %ah
-	stc
-	int	$0x15				# moves feature table to es:bx
-	jc	no_mca
-
-	pushw	%ds
-	movw	%es, %ax
-	movw	%ax, %ds
-	movw	%cs, %ax			# aka SETUPSEG
-	subw	$DELTA_INITSEG, %ax		# aka INITSEG
-	movw	%ax, %es
-	movw	%bx, %si
-	movw	$0xa0, %di
-	movw	(%si), %cx
-	addw	$2, %cx				# table length is a short
-	cmpw	$0x10, %cx
-	jc	sysdesc_ok
-
-	movw	$0x10, %cx			# we keep only first 16 bytes
-sysdesc_ok:
-	rep
-	movsb
-	popw	%ds
-no_mca:
-#ifdef CONFIG_X86_VOYAGER
-	movb	$0xff, 0x40	# flag on config found
-	movb	$0xc0, %al
-	mov	$0xff, %ah
-	int	$0x15		# put voyager config info at es:di
-	jc	no_voyager
-	movw	$0x40, %si	# place voyager info in apm table
-	cld
-	movw	$7, %cx
-voyager_rep:
-	movb	%es:(%di), %al
-	movb	%al,(%si)
-	incw	%di
-	incw	%si
-	decw	%cx
-	jnz	voyager_rep
-no_voyager:	
-#endif
-# Check for PS/2 pointing device
-	movw	%cs, %ax			# aka SETUPSEG
-	subw	$DELTA_INITSEG, %ax		# aka INITSEG
-	movw	%ax, %ds
-	movb	$0, (0x1ff)			# default is no pointing device
-	int	$0x11				# int 0x11: equipment list
-	testb	$0x04, %al			# check if mouse installed
-	jz	no_psmouse
-
-	movb	$0xAA, (0x1ff)			# device present
-no_psmouse:
-
-#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
-	movl	$0x0000E980, %eax		# IST Support 
-	movl	$0x47534943, %edx		# Request value
-	int	$0x15
-
-	movl	%eax, (96)
-	movl	%ebx, (100)
-	movl	%ecx, (104)
-	movl	%edx, (108)
-#endif
-
-#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
-# Then check for an APM BIOS...
-						# %ds points to the bootsector
-	movw	$0, 0x40			# version = 0 means no APM BIOS
-	movw	$0x05300, %ax			# APM BIOS installation check
-	xorw	%bx, %bx
-	int	$0x15
-	jc	done_apm_bios			# Nope, no APM BIOS
-	
-	cmpw	$0x0504d, %bx			# Check for "PM" signature
-	jne	done_apm_bios			# No signature, no APM BIOS
-
-	andw	$0x02, %cx			# Is 32 bit supported?
-	je	done_apm_bios			# No 32-bit, no (good) APM BIOS
-
-	movw	$0x05304, %ax			# Disconnect first just in case
-	xorw	%bx, %bx
-	int	$0x15				# ignore return code
-	movw	$0x05303, %ax			# 32 bit connect
-	xorl	%ebx, %ebx
-	xorw	%cx, %cx			# paranoia :-)
-	xorw	%dx, %dx			#   ...
-	xorl	%esi, %esi			#   ...
-	xorw	%di, %di			#   ...
-	int	$0x15
-	jc	no_32_apm_bios			# Ack, error. 
-
-	movw	%ax,  (66)			# BIOS code segment
-	movl	%ebx, (68)			# BIOS entry point offset
-	movw	%cx,  (72)			# BIOS 16 bit code segment
-	movw	%dx,  (74)			# BIOS data segment
-	movl	%esi, (78)			# BIOS code segment lengths
-	movw	%di,  (82)			# BIOS data segment length
-# Redo the installation check as the 32 bit connect
-# modifies the flags returned on some BIOSs
-	movw	$0x05300, %ax			# APM BIOS installation check
-	xorw	%bx, %bx
-	xorw	%cx, %cx			# paranoia
-	int	$0x15
-	jc	apm_disconnect			# error -> shouldn't happen
-
-	cmpw	$0x0504d, %bx			# check for "PM" signature
-	jne	apm_disconnect			# no sig -> shouldn't happen
-
-	movw	%ax, (64)			# record the APM BIOS version
-	movw	%cx, (76)			# and flags
-	jmp	done_apm_bios
-
-apm_disconnect:					# Tidy up
-	movw	$0x05304, %ax			# Disconnect
-	xorw	%bx, %bx
-	int	$0x15				# ignore return code
-
-	jmp	done_apm_bios
-
-no_32_apm_bios:
-	andw	$0xfffd, (76)			# remove 32 bit support bit
-done_apm_bios:
-#endif
-
-#include "edd.S"
-
-# Now we want to move to protected mode ...
-	cmpw	$0, %cs:realmode_swtch
-	jz	rmodeswtch_normal
-
-	lcall	*%cs:realmode_swtch
-
-	jmp	rmodeswtch_end
-
-rmodeswtch_normal:
-        pushw	%cs
-	call	default_switch
-
-rmodeswtch_end:
-# Now we move the system to its rightful place ... but we check if we have a
-# big-kernel. In that case we *must* not move it ...
-	testb	$LOADED_HIGH, %cs:loadflags
-	jz	do_move0			# .. then we have a normal low
-						# loaded zImage
-						# .. or else we have a high
-						# loaded bzImage
-	jmp	end_move			# ... and we skip moving
-
-do_move0:
-	movw	$0x100, %ax			# start of destination segment
-	movw	%cs, %bp			# aka SETUPSEG
-	subw	$DELTA_INITSEG, %bp		# aka INITSEG
-	movw	%cs:start_sys_seg, %bx		# start of source segment
-	cld
-do_move:
-	movw	%ax, %es			# destination segment
-	incb	%ah				# instead of add ax,#0x100
-	movw	%bx, %ds			# source segment
-	addw	$0x100, %bx
-	subw	%di, %di
-	subw	%si, %si
-	movw 	$0x800, %cx
-	rep
-	movsw
-	cmpw	%bp, %bx			# assume start_sys_seg > 0x200,
-						# so we will perhaps read one
-						# page more than needed, but
-						# never overwrite INITSEG
-						# because destination is a
-						# minimum one page below source
-	jb	do_move
-
-end_move:
-# then we load the segment descriptors
-	movw	%cs, %ax			# aka SETUPSEG
-	movw	%ax, %ds
-		
-# Check whether we need to be downward compatible with version <=201
-	cmpl	$0, cmd_line_ptr
-	jne	end_move_self		# loader uses version >=202 features
-	cmpb	$0x20, type_of_loader
-	je	end_move_self		# bootsect loader, we know of it
-
-# Boot loader doesnt support boot protocol version 2.02.
-# If we have our code not at 0x90000, we need to move it there now.
-# We also then need to move the params behind it (commandline)
-# Because we would overwrite the code on the current IP, we move
-# it in two steps, jumping high after the first one.
-	movw	%cs, %ax
-	cmpw	$SETUPSEG, %ax
-	je	end_move_self
-
-	cli					# make sure we really have
-						# interrupts disabled !
-						# because after this the stack
-						# should not be used
-	subw	$DELTA_INITSEG, %ax		# aka INITSEG
-	movw	%ss, %dx
-	cmpw	%ax, %dx
-	jb	move_self_1
-
-	addw	$INITSEG, %dx
-	subw	%ax, %dx			# this will go into %ss after
-						# the move
-move_self_1:
-	movw	%ax, %ds
-	movw	$INITSEG, %ax			# real INITSEG
-	movw	%ax, %es
-	movw	%cs:setup_move_size, %cx
-	std					# we have to move up, so we use
-						# direction down because the
-						# areas may overlap
-	movw	%cx, %di
-	decw	%di
-	movw	%di, %si
-	subw	$move_self_here+0x200, %cx
-	rep
-	movsb
-	ljmp	$SETUPSEG, $move_self_here
-
-move_self_here:
-	movw	$move_self_here+0x200, %cx
-	rep
-	movsb
-	movw	$SETUPSEG, %ax
-	movw	%ax, %ds
-	movw	%dx, %ss
-end_move_self:					# now we are at the right place
-
-#
-# Enable A20.  This is at the very best an annoying procedure.
-# A20 code ported from SYSLINUX 1.52-1.63 by H. Peter Anvin.
-# AMD Elan bug fix by Robert Schwebel.
-#
-
-#if defined(CONFIG_X86_ELAN)
-	movb $0x02, %al			# alternate A20 gate
-	outb %al, $0x92			# this works on SC410/SC520
-a20_elan_wait:
-	call a20_test
-	jz a20_elan_wait
-	jmp a20_done
-#endif
-
-
-A20_TEST_LOOPS		=  32		# Iterations per wait
-A20_ENABLE_LOOPS	= 255		# Total loops to try		
-
-
-#ifndef CONFIG_X86_VOYAGER
-a20_try_loop:
-
-	# First, see if we are on a system with no A20 gate.
-a20_none:
-	call	a20_test
-	jnz	a20_done
-
-	# Next, try the BIOS (INT 0x15, AX=0x2401)
-a20_bios:
-	movw	$0x2401, %ax
-	pushfl					# Be paranoid about flags
-	int	$0x15
-	popfl
-
-	call	a20_test
-	jnz	a20_done
-
-	# Try enabling A20 through the keyboard controller
-#endif /* CONFIG_X86_VOYAGER */
-a20_kbc:
-	call	empty_8042
-
-#ifndef CONFIG_X86_VOYAGER
-	call	a20_test			# Just in case the BIOS worked
-	jnz	a20_done			# but had a delayed reaction.
-#endif
-
-	movb	$0xD1, %al			# command write
-	outb	%al, $0x64
-	call	empty_8042
-
-	movb	$0xDF, %al			# A20 on
-	outb	%al, $0x60
-	call	empty_8042
-
-#ifndef CONFIG_X86_VOYAGER
-	# Wait until a20 really *is* enabled; it can take a fair amount of
-	# time on certain systems; Toshiba Tecras are known to have this
-	# problem.
-a20_kbc_wait:
-	xorw	%cx, %cx
-a20_kbc_wait_loop:
-	call	a20_test
-	jnz	a20_done
-	loop	a20_kbc_wait_loop
-
-	# Final attempt: use "configuration port A"
-a20_fast:
-	inb	$0x92, %al			# Configuration Port A
-	orb	$0x02, %al			# "fast A20" version
-	andb	$0xFE, %al			# don't accidentally reset
-	outb	%al, $0x92
-
-	# Wait for configuration port A to take effect
-a20_fast_wait:
-	xorw	%cx, %cx
-a20_fast_wait_loop:
-	call	a20_test
-	jnz	a20_done
-	loop	a20_fast_wait_loop
-
-	# A20 is still not responding.  Try frobbing it again.
-	# 
-	decb	(a20_tries)
-	jnz	a20_try_loop
-	
-	movw	$a20_err_msg, %si
-	call	prtstr
-
-a20_die:
-	hlt
-	jmp	a20_die
-
-a20_tries:
-	.byte	A20_ENABLE_LOOPS
-
-a20_err_msg:
-	.ascii	"linux: fatal error: A20 gate not responding!"
-	.byte	13, 10, 0
-
-	# If we get here, all is good
-a20_done:
-
-#endif /* CONFIG_X86_VOYAGER */
-# set up gdt and idt and 32bit start address
-	lidt	idt_48				# load idt with 0,0
-	xorl	%eax, %eax			# Compute gdt_base
-	movw	%ds, %ax			# (Convert %ds:gdt to a linear ptr)
-	shll	$4, %eax
-	addl	%eax, code32
-	addl	$gdt, %eax
-	movl	%eax, (gdt_48+2)
-	lgdt	gdt_48				# load gdt with whatever is
-						# appropriate
-
-# make sure any possible coprocessor is properly reset..
-	xorw	%ax, %ax
-	outb	%al, $0xf0
-	call	delay
-
-	outb	%al, $0xf1
-	call	delay
-
-# well, that went ok, I hope. Now we mask all interrupts - the rest
-# is done in init_IRQ().
-	movb	$0xFF, %al			# mask all interrupts for now
-	outb	%al, $0xA1
-	call	delay
-	
-	movb	$0xFB, %al			# mask all irq's but irq2 which
-	outb	%al, $0x21			# is cascaded
-
-# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't
-# need no steenking BIOS anyway (except for the initial loading :-).
-# The BIOS-routine wants lots of unnecessary data, and it's less
-# "interesting" anyway. This is how REAL programmers do it.
-#
-# Well, now's the time to actually move into protected mode. To make
-# things as simple as possible, we do no register set-up or anything,
-# we let the gnu-compiled 32-bit programs do that. We just jump to
-# absolute address 0x1000 (or the loader supplied one),
-# in 32-bit protected mode.
-#
-# Note that the short jump isn't strictly needed, although there are
-# reasons why it might be a good idea. It won't hurt in any case.
-	movw	$1, %ax				# protected mode (PE) bit
-	lmsw	%ax				# This is it!
-	jmp	flush_instr
-
-flush_instr:
-	xorw	%bx, %bx			# Flag to indicate a boot
-	xorl	%esi, %esi			# Pointer to real-mode code
-	movw	%cs, %si
-	subw	$DELTA_INITSEG, %si
-	shll	$4, %esi			# Convert to 32-bit pointer
-
-# jump to startup_32 in arch/i386/boot/compressed/head.S
-#	
-# NOTE: For high loaded big kernels we need a
-#	jmpi    0x100000,__BOOT_CS
-#
-#	but we yet haven't reloaded the CS register, so the default size 
-#	of the target offset still is 16 bit.
-#	However, using an operand prefix (0x66), the CPU will properly
-#	take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
-#	Manual, Mixing 16-bit and 32-bit code, page 16-6)
-
-	.byte 0x66, 0xea			# prefix + jmpi-opcode
-code32:	.long	startup_32			# will be set to %cs+startup_32
-	.word	__BOOT_CS
-.code32
-startup_32:
-	movl $(__BOOT_DS), %eax
-	movl %eax, %ds
-	movl %eax, %es
-	movl %eax, %fs
-	movl %eax, %gs
-	movl %eax, %ss
-
-	xorl %eax, %eax
-1:	incl %eax				# check that A20 really IS enabled
-	movl %eax, 0x00000000			# loop forever if it isn't
-	cmpl %eax, 0x00100000
-	je 1b
-
-	# Jump to the 32bit entry point
-	jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi)
-.code16
-
-# Here's a bunch of information about your current kernel..
-kernel_version:	.ascii	UTS_RELEASE
-		.ascii	" ("
-		.ascii	LINUX_COMPILE_BY
-		.ascii	"@"
-		.ascii	LINUX_COMPILE_HOST
-		.ascii	") "
-		.ascii	UTS_VERSION
-		.byte	0
-
-# This is the default real mode switch routine.
-# to be called just before protected mode transition
-default_switch:
-	cli					# no interrupts allowed !
-	movb	$0x80, %al			# disable NMI for bootup
-						# sequence
-	outb	%al, $0x70
-	lret
-
-
-#ifndef CONFIG_X86_VOYAGER
-# This routine tests whether or not A20 is enabled.  If so, it
-# exits with zf = 0.
-#
-# The memory address used, 0x200, is the int $0x80 vector, which
-# should be safe.
-
-A20_TEST_ADDR = 4*0x80
-
-a20_test:
-	pushw	%cx
-	pushw	%ax
-	xorw	%cx, %cx
-	movw	%cx, %fs			# Low memory
-	decw	%cx
-	movw	%cx, %gs			# High memory area
-	movw	$A20_TEST_LOOPS, %cx
-	movw	%fs:(A20_TEST_ADDR), %ax
-	pushw	%ax
-a20_test_wait:
-	incw	%ax
-	movw	%ax, %fs:(A20_TEST_ADDR)
-	call	delay				# Serialize and make delay constant
-	cmpw	%gs:(A20_TEST_ADDR+0x10), %ax
-	loope	a20_test_wait
-
-	popw	%fs:(A20_TEST_ADDR)
-	popw	%ax
-	popw	%cx
-	ret	
-
-#endif /* CONFIG_X86_VOYAGER */
-
-# This routine checks that the keyboard command queue is empty
-# (after emptying the output buffers)
-#
-# Some machines have delusions that the keyboard buffer is always full
-# with no keyboard attached...
-#
-# If there is no keyboard controller, we will usually get 0xff
-# to all the reads.  With each IO taking a microsecond and
-# a timeout of 100,000 iterations, this can take about half a
-# second ("delay" == outb to port 0x80). That should be ok,
-# and should also be plenty of time for a real keyboard controller
-# to empty.
-#
-
-empty_8042:
-	pushl	%ecx
-	movl	$100000, %ecx
-
-empty_8042_loop:
-	decl	%ecx
-	jz	empty_8042_end_loop
-
-	call	delay
-
-	inb	$0x64, %al			# 8042 status port
-	testb	$1, %al				# output buffer?
-	jz	no_output
-
-	call	delay
-	inb	$0x60, %al			# read it
-	jmp	empty_8042_loop
-
-no_output:
-	testb	$2, %al				# is input buffer full?
-	jnz	empty_8042_loop			# yes - loop
-empty_8042_end_loop:
-	popl	%ecx
-	ret
-
-# Read the cmos clock. Return the seconds in al
-gettime:
-	pushw	%cx
-	movb	$0x02, %ah
-	int	$0x1a
-	movb	%dh, %al			# %dh contains the seconds
-	andb	$0x0f, %al
-	movb	%dh, %ah
-	movb	$0x04, %cl
-	shrb	%cl, %ah
-	aad
-	popw	%cx
-	ret
-
-# Delay is needed after doing I/O
-delay:
-	outb	%al,$0x80
-	ret
-
-# Descriptor tables
-#
-# NOTE: The intel manual says gdt should be sixteen bytes aligned for
-# efficiency reasons.  However, there are machines which are known not
-# to boot with misaligned GDTs, so alter this at your peril!  If you alter
-# GDT_ENTRY_BOOT_CS (in asm/segment.h) remember to leave at least two
-# empty GDT entries (one for NULL and one reserved).
-#
-# NOTE:	On some CPUs, the GDT must be 8 byte aligned.  This is
-# true for the Voyager Quad CPU card which will not boot without
-# This directive.  16 byte aligment is recommended by intel.
-#
-	.align 16
-gdt:
-	.fill GDT_ENTRY_BOOT_CS,8,0
-
-	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
-	.word	0				# base address = 0
-	.word	0x9A00				# code read/exec
-	.word	0x00CF				# granularity = 4096, 386
-						#  (+5th nibble of limit)
-
-	.word	0xFFFF				# 4Gb - (0x100000*0x1000 = 4Gb)
-	.word	0				# base address = 0
-	.word	0x9200				# data read/write
-	.word	0x00CF				# granularity = 4096, 386
-						#  (+5th nibble of limit)
-gdt_end:
-	.align	4
-	
-	.word	0				# alignment byte
-idt_48:
-	.word	0				# idt limit = 0
-	.word	0, 0				# idt base = 0L
-
-	.word	0				# alignment byte
-gdt_48:
-	.word	gdt_end - gdt - 1		# gdt limit
-	.word	0, 0				# gdt base (filled in later)
-
-# Include video setup & detection code
-
-#include "video.S"
-
-# Setup signature -- must be last
-setup_sig1:	.word	SIG1
-setup_sig2:	.word	SIG2
-
-# After this point, there is some free space which is used by the video mode
-# handling code to store the temporary mode table (not used by the kernel).
-
-modelist:
-
-.text
-endtext:
-.data
-enddata:
-.bss
-endbss:
diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S
deleted file mode 100644
index 8143c9516cb4..000000000000
--- a/arch/i386/boot/video.S
+++ /dev/null
@@ -1,2043 +0,0 @@
-/*	video.S
- *
- *	Display adapter & video mode setup, version 2.13 (14-May-99)
- *
- *	Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz>
- *	Based on the original setup.S code (C) Linus Torvalds and Mats Anderson
- *
- *	Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999
- *
- *	For further information, look at Documentation/svga.txt.
- *
- */
-
-/* Enable autodetection of SVGA adapters and modes. */
-#undef CONFIG_VIDEO_SVGA
-
-/* Enable autodetection of VESA modes */
-#define CONFIG_VIDEO_VESA
-
-/* Enable compacting of mode table */
-#define CONFIG_VIDEO_COMPACT
-
-/* Retain screen contents when switching modes */
-#define CONFIG_VIDEO_RETAIN
-
-/* Enable local mode list */
-#undef CONFIG_VIDEO_LOCAL
-
-/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */
-#undef CONFIG_VIDEO_400_HACK
-
-/* Hack that lets you force specific BIOS mode ID and specific dimensions */
-#undef CONFIG_VIDEO_GFX_HACK
-#define VIDEO_GFX_BIOS_AX 0x4f02	/* 800x600 on ThinkPad */
-#define VIDEO_GFX_BIOS_BX 0x0102
-#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425	/* 100x37 */
-
-/* This code uses an extended set of video mode numbers. These include:
- * Aliases for standard modes
- *	NORMAL_VGA (-1)
- *	EXTENDED_VGA (-2)
- *	ASK_VGA (-3)
- * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
- * of compatibility when extending the table. These are between 0x00 and 0xff.
- */
-#define VIDEO_FIRST_MENU 0x0000
-
-/* Standard BIOS video modes (BIOS number + 0x0100) */
-#define VIDEO_FIRST_BIOS 0x0100
-
-/* VESA BIOS video modes (VESA number + 0x0200) */
-#define VIDEO_FIRST_VESA 0x0200
-
-/* Video7 special modes (BIOS number + 0x0900) */
-#define VIDEO_FIRST_V7 0x0900
-
-/* Special video modes */
-#define VIDEO_FIRST_SPECIAL 0x0f00
-#define VIDEO_80x25 0x0f00
-#define VIDEO_8POINT 0x0f01
-#define VIDEO_80x43 0x0f02
-#define VIDEO_80x28 0x0f03
-#define VIDEO_CURRENT_MODE 0x0f04
-#define VIDEO_80x30 0x0f05
-#define VIDEO_80x34 0x0f06
-#define VIDEO_80x60 0x0f07
-#define VIDEO_GFX_HACK 0x0f08
-#define VIDEO_LAST_SPECIAL 0x0f09
-
-/* Video modes given by resolution */
-#define VIDEO_FIRST_RESOLUTION 0x1000
-
-/* The "recalculate timings" flag */
-#define VIDEO_RECALC 0x8000
-
-/* Positions of various video parameters passed to the kernel */
-/* (see also include/linux/tty.h) */
-#define PARAM_CURSOR_POS	0x00
-#define PARAM_VIDEO_PAGE	0x04
-#define PARAM_VIDEO_MODE	0x06
-#define PARAM_VIDEO_COLS	0x07
-#define PARAM_VIDEO_EGA_BX	0x0a
-#define PARAM_VIDEO_LINES	0x0e
-#define PARAM_HAVE_VGA		0x0f
-#define PARAM_FONT_POINTS	0x10
-
-#define PARAM_LFB_WIDTH		0x12
-#define PARAM_LFB_HEIGHT	0x14
-#define PARAM_LFB_DEPTH		0x16
-#define PARAM_LFB_BASE		0x18
-#define PARAM_LFB_SIZE		0x1c
-#define PARAM_LFB_LINELENGTH	0x24
-#define PARAM_LFB_COLORS	0x26
-#define PARAM_VESAPM_SEG	0x2e
-#define PARAM_VESAPM_OFF	0x30
-#define PARAM_LFB_PAGES		0x32
-#define PARAM_VESA_ATTRIB	0x34
-#define PARAM_CAPABILITIES	0x36
-
-/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
-#ifdef CONFIG_VIDEO_RETAIN
-#define DO_STORE call store_screen
-#else
-#define DO_STORE
-#endif /* CONFIG_VIDEO_RETAIN */
-
-# This is the main entry point called by setup.S
-# %ds *must* be pointing to the bootsector
-video:	pushw	%ds		# We use different segments
-	pushw	%ds		# FS contains original DS
-	popw	%fs
-	pushw	%cs		# DS is equal to CS
-	popw	%ds
-	pushw	%cs		# ES is equal to CS
-	popw	%es
-	xorw	%ax, %ax
-	movw	%ax, %gs	# GS is zero
-	cld
-	call	basic_detect	# Basic adapter type testing (EGA/VGA/MDA/CGA)
-#ifdef CONFIG_VIDEO_SELECT
-	movw	%fs:(0x01fa), %ax		# User selected video mode
-	cmpw	$ASK_VGA, %ax			# Bring up the menu
-	jz	vid2
-
-	call	mode_set			# Set the mode
-	jc	vid1
-
-	leaw	badmdt, %si			# Invalid mode ID
-	call	prtstr
-vid2:	call	mode_menu
-vid1:
-#ifdef CONFIG_VIDEO_RETAIN
-	call	restore_screen			# Restore screen contents
-#endif /* CONFIG_VIDEO_RETAIN */
-	call	store_edid
-#endif /* CONFIG_VIDEO_SELECT */
-	call	mode_params			# Store mode parameters
-	popw	%ds				# Restore original DS
-	ret
-
-# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel.
-basic_detect:
-	movb	$0, %fs:(PARAM_HAVE_VGA)
-	movb	$0x12, %ah	# Check EGA/VGA
-	movb	$0x10, %bl
-	int	$0x10
-	movw	%bx, %fs:(PARAM_VIDEO_EGA_BX)	# Identifies EGA to the kernel
-	cmpb	$0x10, %bl			# No, it's a CGA/MDA/HGA card.
-	je	basret
-
-	incb	adapter
-	movw	$0x1a00, %ax			# Check EGA or VGA?
-	int	$0x10
-	cmpb	$0x1a, %al			# 1a means VGA...
-	jne	basret				# anything else is EGA.
-	
-	incb	%fs:(PARAM_HAVE_VGA)		# We've detected a VGA
-	incb	adapter
-basret:	ret
-
-# Store the video mode parameters for later usage by the kernel.
-# This is done by asking the BIOS except for the rows/columns
-# parameters in the default 80x25 mode -- these are set directly,
-# because some very obscure BIOSes supply insane values.
-mode_params:
-#ifdef CONFIG_VIDEO_SELECT
-	cmpb	$0, graphic_mode
-	jnz	mopar_gr
-#endif
-	movb	$0x03, %ah			# Read cursor position
-	xorb	%bh, %bh
-	int	$0x10
-	movw	%dx, %fs:(PARAM_CURSOR_POS)
-	movb	$0x0f, %ah			# Read page/mode/width
-	int	$0x10
-	movw	%bx, %fs:(PARAM_VIDEO_PAGE)
-	movw	%ax, %fs:(PARAM_VIDEO_MODE)	# Video mode and screen width
-	cmpb	$0x7, %al			# MDA/HGA => segment differs
-	jnz	mopar0
-
-	movw	$0xb000, video_segment
-mopar0: movw	%gs:(0x485), %ax		# Font size
-	movw	%ax, %fs:(PARAM_FONT_POINTS)	# (valid only on EGA/VGA)
-	movw	force_size, %ax			# Forced size?
-	orw	%ax, %ax
-	jz	mopar1
-
-	movb	%ah, %fs:(PARAM_VIDEO_COLS)
-	movb	%al, %fs:(PARAM_VIDEO_LINES)
-	ret
-
-mopar1:	movb	$25, %al
-	cmpb	$0, adapter			# If we are on CGA/MDA/HGA, the
-	jz	mopar2				# screen must have 25 lines.
-
-	movb	%gs:(0x484), %al		# On EGA/VGA, use the EGA+ BIOS
-	incb	%al				# location of max lines.
-mopar2: movb	%al, %fs:(PARAM_VIDEO_LINES)
-	ret
-
-#ifdef CONFIG_VIDEO_SELECT
-# Fetching of VESA frame buffer parameters
-mopar_gr:
-	leaw	modelist+1024, %di
-	movb	$0x23, %fs:(PARAM_HAVE_VGA)
-	movw	16(%di), %ax
-	movw	%ax, %fs:(PARAM_LFB_LINELENGTH)
-	movw	18(%di), %ax
-	movw	%ax, %fs:(PARAM_LFB_WIDTH)
-	movw	20(%di), %ax
-	movw	%ax, %fs:(PARAM_LFB_HEIGHT)
-	movb	25(%di), %al
-	movb	$0, %ah
-	movw	%ax, %fs:(PARAM_LFB_DEPTH)
-	movb	29(%di), %al	
-	movb	$0, %ah
-	movw	%ax, %fs:(PARAM_LFB_PAGES)
-	movl	40(%di), %eax
-	movl	%eax, %fs:(PARAM_LFB_BASE)
-	movl	31(%di), %eax
-	movl	%eax, %fs:(PARAM_LFB_COLORS)
-	movl	35(%di), %eax
-	movl	%eax, %fs:(PARAM_LFB_COLORS+4)
-	movw	0(%di), %ax
-	movw	%ax, %fs:(PARAM_VESA_ATTRIB)
-
-# get video mem size
-	leaw	modelist+1024, %di
-	movw	$0x4f00, %ax
-	int	$0x10
-	xorl	%eax, %eax
-	movw	18(%di), %ax
-	movl	%eax, %fs:(PARAM_LFB_SIZE)
-
-# store mode capabilities
-	movl 10(%di), %eax
-	movl %eax, %fs:(PARAM_CAPABILITIES)
-
-# switching the DAC to 8-bit is for <= 8 bpp only
-	movw	%fs:(PARAM_LFB_DEPTH), %ax
-	cmpw	$8, %ax
-	jg	dac_done
-
-# get DAC switching capability
-	xorl	%eax, %eax
-	movb	10(%di), %al
-	testb	$1, %al
-	jz	dac_set
-
-# attempt to switch DAC to 8-bit
-	movw	$0x4f08, %ax
-	movw	$0x0800, %bx
-	int	$0x10
-	cmpw	$0x004f, %ax
-	jne     dac_set
-	movb    %bh, dac_size		# store actual DAC size
-
-dac_set:
-# set color size to DAC size
-	movb	dac_size, %al
-	movb	%al, %fs:(PARAM_LFB_COLORS+0)
-	movb	%al, %fs:(PARAM_LFB_COLORS+2)
-	movb	%al, %fs:(PARAM_LFB_COLORS+4)
-	movb	%al, %fs:(PARAM_LFB_COLORS+6)
-
-# set color offsets to 0
-	movb	$0, %fs:(PARAM_LFB_COLORS+1)
-	movb	$0, %fs:(PARAM_LFB_COLORS+3)
-	movb	$0, %fs:(PARAM_LFB_COLORS+5)
-	movb	$0, %fs:(PARAM_LFB_COLORS+7)
-
-dac_done:
-# get protected mode interface informations
-	movw	$0x4f0a, %ax
-	xorw	%bx, %bx
-	xorw	%di, %di
-	int	$0x10
-	cmp	$0x004f, %ax
-	jnz	no_pm
-
-	movw	%es, %fs:(PARAM_VESAPM_SEG)
-	movw	%di, %fs:(PARAM_VESAPM_OFF)
-no_pm:	ret
-
-# The video mode menu
-mode_menu:
-	leaw	keymsg, %si			# "Return/Space/Timeout" message
-	call	prtstr
-	call	flush
-nokey:	call	getkt
-
-	cmpb	$0x0d, %al			# ENTER ?
-	je	listm				# yes - manual mode selection
-
-	cmpb	$0x20, %al			# SPACE ?
-	je	defmd1				# no - repeat
-
-	call 	beep
-	jmp	nokey
-
-defmd1:	ret					# No mode chosen? Default 80x25
-
-listm:	call	mode_table			# List mode table
-listm0:	leaw	name_bann, %si			# Print adapter name
-	call	prtstr
-	movw	card_name, %si
-	orw	%si, %si
-	jnz	an2
-
-	movb	adapter, %al
-	leaw	old_name, %si
-	orb	%al, %al
-	jz	an1
-
-	leaw	ega_name, %si
-	decb	%al
-	jz	an1
-
-	leaw	vga_name, %si
-	jmp	an1
-
-an2:	call	prtstr
-	leaw	svga_name, %si
-an1:	call	prtstr
-	leaw	listhdr, %si			# Table header
-	call	prtstr
-	movb	$0x30, %dl			# DL holds mode number
-	leaw	modelist, %si
-lm1:	cmpw	$ASK_VGA, (%si)			# End?
-	jz	lm2
-
-	movb	%dl, %al			# Menu selection number
-	call	prtchr
-	call	prtsp2
-	lodsw
-	call	prthw				# Mode ID
-	call	prtsp2
-	movb	0x1(%si), %al
-	call	prtdec				# Rows
-	movb	$0x78, %al			# the letter 'x'
-	call	prtchr
-	lodsw
-	call	prtdec				# Columns
-	movb	$0x0d, %al			# New line
-	call	prtchr
-	movb	$0x0a, %al
-	call	prtchr
-	incb	%dl				# Next character
-	cmpb	$0x3a, %dl
-	jnz	lm1
-
-	movb	$0x61, %dl
-	jmp	lm1
-
-lm2:	leaw	prompt, %si			# Mode prompt
-	call	prtstr
-	leaw	edit_buf, %di			# Editor buffer
-lm3:	call	getkey
-	cmpb	$0x0d, %al			# Enter?
-	jz	lment
-
-	cmpb	$0x08, %al			# Backspace?
-	jz	lmbs
-
-	cmpb	$0x20, %al			# Printable?
-	jc	lm3
-
-	cmpw	$edit_buf+4, %di		# Enough space?
-	jz	lm3
-
-	stosb
-	call	prtchr
-	jmp	lm3
-
-lmbs:	cmpw	$edit_buf, %di			# Backspace
-	jz	lm3
-
-	decw	%di
-	movb	$0x08, %al
-	call	prtchr
-	call	prtspc
-	movb	$0x08, %al
-	call	prtchr
-	jmp	lm3
-	
-lment:	movb	$0, (%di)
-	leaw	crlft, %si
-	call	prtstr
-	leaw	edit_buf, %si
-	cmpb	$0, (%si)			# Empty string = default mode
-	jz	lmdef
-
-	cmpb	$0, 1(%si)			# One character = menu selection
-	jz	mnusel
-
-	cmpw	$0x6373, (%si)			# "scan" => mode scanning
-	jnz	lmhx
-
-	cmpw	$0x6e61, 2(%si)
-	jz	lmscan
-
-lmhx:	xorw	%bx, %bx			# Else => mode ID in hex
-lmhex:	lodsb
-	orb	%al, %al
-	jz	lmuse1
-
-	subb	$0x30, %al
-	jc	lmbad
-
-	cmpb	$10, %al
-	jc	lmhx1
-
-	subb	$7, %al
-	andb	$0xdf, %al
-	cmpb	$10, %al
-	jc	lmbad
-
-	cmpb	$16, %al
-	jnc	lmbad
-
-lmhx1:	shlw	$4, %bx
-	orb	%al, %bl
-	jmp	lmhex
-
-lmuse1:	movw	%bx, %ax
-	jmp	lmuse
-
-mnusel:	lodsb					# Menu selection
-	xorb	%ah, %ah
-	subb	$0x30, %al
-	jc	lmbad
-
-	cmpb	$10, %al
-	jc	lmuse
-	
-	cmpb	$0x61-0x30, %al
-	jc	lmbad
-	
-	subb	$0x61-0x30-10, %al
-	cmpb	$36, %al
-	jnc	lmbad
-
-lmuse:	call	mode_set
-	jc	lmdef
-
-lmbad:	leaw	unknt, %si
-	call	prtstr
-	jmp	lm2
-lmscan:	cmpb	$0, adapter			# Scanning only on EGA/VGA
-	jz	lmbad
-
-	movw	$0, mt_end			# Scanning of modes is
-	movb	$1, scanning			# done as new autodetection.
-	call	mode_table
-	jmp	listm0
-lmdef:	ret
-
-# Additional parts of mode_set... (relative jumps, you know)
-setv7:						# Video7 extended modes
-	DO_STORE
-	subb	$VIDEO_FIRST_V7>>8, %bh
-	movw	$0x6f05, %ax
-	int	$0x10
-	stc
-	ret
-
-_setrec:	jmp	setrec			# Ugly...
-_set_80x25:	jmp	set_80x25
-
-# Aliases for backward compatibility.
-setalias:
-	movw	$VIDEO_80x25, %ax
-	incw	%bx
-	jz	mode_set
-
-	movb	$VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al
-	incw	%bx
-	jnz	setbad				# Fall-through!
-
-# Setting of user mode (AX=mode ID) => CF=success
-mode_set:
-	movw	%ax, %fs:(0x01fa)		# Store mode for use in acpi_wakeup.S
-	movw	%ax, %bx
-	cmpb	$0xff, %ah
-	jz	setalias
-
-	testb	$VIDEO_RECALC>>8, %ah
-	jnz	_setrec
-
-	cmpb	$VIDEO_FIRST_RESOLUTION>>8, %ah
-	jnc	setres
-	
-	cmpb	$VIDEO_FIRST_SPECIAL>>8, %ah
-	jz	setspc
-	
-	cmpb	$VIDEO_FIRST_V7>>8, %ah
-	jz	setv7
-	
-	cmpb	$VIDEO_FIRST_VESA>>8, %ah
-	jnc	check_vesa
-	
-	orb	%ah, %ah
-	jz	setmenu
-	
-	decb	%ah
-	jz	setbios
-
-setbad:	clc
-	movb	$0, do_restore			# The screen needn't be restored
-	ret
-
-setvesa:
-	DO_STORE
-	subb	$VIDEO_FIRST_VESA>>8, %bh
-	movw	$0x4f02, %ax			# VESA BIOS mode set call
-	int	$0x10
-	cmpw	$0x004f, %ax			# AL=4f if implemented
-	jnz	setbad				# AH=0 if OK
-
-	stc
-	ret
-
-setbios:
-	DO_STORE
-	int	$0x10				# Standard BIOS mode set call
-	pushw	%bx
-	movb	$0x0f, %ah			# Check if really set
-	int	$0x10
-	popw	%bx
-	cmpb	%bl, %al
-	jnz	setbad
-	
-	stc
-	ret
-
-setspc:	xorb	%bh, %bh			# Set special mode
-	cmpb	$VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl
-	jnc	setbad
-	
-	addw	%bx, %bx
-	jmp	*spec_inits(%bx)
-
-setmenu:
-	orb	%al, %al			# 80x25 is an exception
-	jz	_set_80x25
-	
-	pushw	%bx				# Set mode chosen from menu
-	call	mode_table			# Build the mode table
-	popw	%ax
-	shlw	$2, %ax
-	addw	%ax, %si
-	cmpw	%di, %si
-	jnc	setbad
-	
-	movw	(%si), %ax			# Fetch mode ID
-_m_s:	jmp	mode_set
-
-setres:	pushw	%bx				# Set mode chosen by resolution
-	call	mode_table
-	popw	%bx
-	xchgb	%bl, %bh
-setr1:	lodsw
-	cmpw	$ASK_VGA, %ax			# End of the list?
-	jz	setbad
-	
-	lodsw
-	cmpw	%bx, %ax
-	jnz	setr1
-	
-	movw	-4(%si), %ax			# Fetch mode ID
-	jmp	_m_s
-
-check_vesa:
-#ifdef CONFIG_FIRMWARE_EDID
-	leaw	modelist+1024, %di
-	movw	$0x4f00, %ax
-	int	$0x10
-	cmpw	$0x004f, %ax
-	jnz	setbad
-
-	movw	4(%di), %ax
-	movw	%ax, vbe_version
-#endif
-	leaw	modelist+1024, %di
-	subb	$VIDEO_FIRST_VESA>>8, %bh
-	movw	%bx, %cx			# Get mode information structure
-	movw	$0x4f01, %ax
-	int	$0x10
-	addb	$VIDEO_FIRST_VESA>>8, %bh
-	cmpw	$0x004f, %ax
-	jnz	setbad
-
-	movb	(%di), %al			# Check capabilities.
-	andb	$0x19, %al
-	cmpb	$0x09, %al
-	jz	setvesa				# This is a text mode
-
-	movb	(%di), %al			# Check capabilities.
-	andb	$0x99, %al
-	cmpb	$0x99, %al
-	jnz	_setbad				# Doh! No linear frame buffer.
-
-	subb	$VIDEO_FIRST_VESA>>8, %bh
-	orw	$0x4000, %bx			# Use linear frame buffer
-	movw	$0x4f02, %ax			# VESA BIOS mode set call
-	int	$0x10
-	cmpw	$0x004f, %ax			# AL=4f if implemented
-	jnz	_setbad				# AH=0 if OK
-
-	movb	$1, graphic_mode		# flag graphic mode
-	movb	$0, do_restore			# no screen restore
-	stc
-	ret
-
-_setbad:	jmp	setbad          	# Ugly...
-
-# Recalculate vertical display end registers -- this fixes various
-# inconsistencies of extended modes on many adapters. Called when
-# the VIDEO_RECALC flag is set in the mode ID.
-
-setrec:	subb	$VIDEO_RECALC>>8, %ah		# Set the base mode
-	call	mode_set
-	jnc	rct3
-
-	movw	%gs:(0x485), %ax		# Font size in pixels
-	movb	%gs:(0x484), %bl		# Number of rows
-	incb	%bl
-	mulb	%bl				# Number of visible
-	decw	%ax				# scan lines - 1
-	movw	$0x3d4, %dx
-	movw	%ax, %bx
-	movb	$0x12, %al			# Lower 8 bits
-	movb	%bl, %ah
-	outw	%ax, %dx
-	movb	$0x07, %al		# Bits 8 and 9 in the overflow register
-	call	inidx
-	xchgb	%al, %ah
-	andb	$0xbd, %ah
-	shrb	%bh
-	jnc	rct1
-	orb	$0x02, %ah
-rct1:	shrb	%bh
-	jnc	rct2
-	orb	$0x40, %ah
-rct2:	movb	$0x07, %al
-	outw	%ax, %dx
-	stc
-rct3:	ret
-
-# Table of routines for setting of the special modes.
-spec_inits:
-	.word	set_80x25
-	.word	set_8pixel
-	.word	set_80x43
-	.word	set_80x28
-	.word	set_current
-	.word	set_80x30
-	.word	set_80x34
-	.word	set_80x60
-	.word	set_gfx
-
-# Set the 80x25 mode. If already set, do nothing.
-set_80x25:
-	movw	$0x5019, force_size		# Override possibly broken BIOS
-use_80x25:
-#ifdef CONFIG_VIDEO_400_HACK
-	movw	$0x1202, %ax			# Force 400 scan lines
-	movb	$0x30, %bl
-	int	$0x10
-#else
-	movb	$0x0f, %ah			# Get current mode ID
-	int	$0x10
-	cmpw	$0x5007, %ax	# Mode 7 (80x25 mono) is the only one available
-	jz	st80		# on CGA/MDA/HGA and is also available on EGAM
-
-	cmpw	$0x5003, %ax	# Unknown mode, force 80x25 color
-	jnz	force3
-
-st80:	cmpb	$0, adapter	# CGA/MDA/HGA => mode 3/7 is always 80x25
-	jz	set80
-
-	movb	%gs:(0x0484), %al	# This is EGA+ -- beware of 80x50 etc.
-	orb	%al, %al		# Some buggy BIOS'es set 0 rows
-	jz	set80
-	
-	cmpb	$24, %al		# It's hopefully correct
-	jz	set80
-#endif /* CONFIG_VIDEO_400_HACK */
-force3:	DO_STORE
-	movw	$0x0003, %ax			# Forced set
-	int	$0x10
-set80:	stc
-	ret
-
-# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls.
-set_8pixel:
-	DO_STORE
-	call	use_80x25			# The base is 80x25
-set_8pt:
-	movw	$0x1112, %ax			# Use 8x8 font
-	xorb	%bl, %bl
-	int	$0x10
-	movw	$0x1200, %ax			# Use alternate print screen
-	movb	$0x20, %bl
-	int	$0x10
-	movw	$0x1201, %ax			# Turn off cursor emulation
-	movb	$0x34, %bl
-	int	$0x10
-	movb	$0x01, %ah			# Define cursor scan lines 6-7
-	movw	$0x0607, %cx
-	int	$0x10
-set_current:
-	stc
-	ret
-
-# Set the 80x28 mode. This mode works on all VGA's, because it's a standard
-# 80x25 mode with 14-point fonts instead of 16-point.
-set_80x28:
-	DO_STORE
-	call	use_80x25			# The base is 80x25
-set14:	movw	$0x1111, %ax			# Use 9x14 font
-	xorb	%bl, %bl
-	int	$0x10
-	movb	$0x01, %ah			# Define cursor scan lines 11-12
-	movw	$0x0b0c, %cx
-	int	$0x10
-	stc
-	ret
-
-# Set the 80x43 mode. This mode is works on all VGA's.
-# It's a 350-scanline mode with 8-pixel font.
-set_80x43:
-	DO_STORE
-	movw	$0x1201, %ax			# Set 350 scans
-	movb	$0x30, %bl
-	int	$0x10
-	movw	$0x0003, %ax			# Reset video mode
-	int	$0x10
-	jmp	set_8pt				# Use 8-pixel font
-
-# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font.
-set_80x30:
-	call	use_80x25			# Start with real 80x25
-	DO_STORE
-	movw	$0x3cc, %dx			# Get CRTC port
-	inb	%dx, %al
-	movb	$0xd4, %dl
-	rorb	%al				# Mono or color?
-	jc	set48a
-
-	movb	$0xb4, %dl
-set48a:	movw	$0x0c11, %ax		# Vertical sync end (also unlocks CR0-7)
- 	call	outidx
-	movw	$0x0b06, %ax			# Vertical total
- 	call	outidx
-	movw	$0x3e07, %ax			# (Vertical) overflow
- 	call	outidx
-	movw	$0xea10, %ax			# Vertical sync start
- 	call	outidx
-	movw	$0xdf12, %ax			# Vertical display end
-	call	outidx
-	movw	$0xe715, %ax			# Vertical blank start
- 	call	outidx
-	movw	$0x0416, %ax			# Vertical blank end
- 	call	outidx
-	pushw	%dx
-	movb	$0xcc, %dl			# Misc output register (read)
- 	inb	%dx, %al
- 	movb	$0xc2, %dl			# (write)
- 	andb	$0x0d, %al	# Preserve clock select bits and color bit
- 	orb	$0xe2, %al			# Set correct sync polarity
- 	outb	%al, %dx
-	popw	%dx
-	movw	$0x501e, force_size
-	stc					# That's all.
-	ret
-
-# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font.
-set_80x34:
-	call	set_80x30			# Set 480 scans
-	call	set14				# And 14-pt font
-	movw	$0xdb12, %ax			# VGA vertical display end
-	movw	$0x5022, force_size
-setvde:	call	outidx
-	stc
-	ret
-
-# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font.
-set_80x60:
-	call	set_80x30			# Set 480 scans
-	call	set_8pt				# And 8-pt font
-	movw	$0xdf12, %ax			# VGA vertical display end
-	movw	$0x503c, force_size
-	jmp	setvde
-
-# Special hack for ThinkPad graphics
-set_gfx:
-#ifdef CONFIG_VIDEO_GFX_HACK
-	movw	$VIDEO_GFX_BIOS_AX, %ax
-	movw	$VIDEO_GFX_BIOS_BX, %bx
-	int	$0x10
-	movw	$VIDEO_GFX_DUMMY_RESOLUTION, force_size
-	stc
-#endif
-	ret
-
-#ifdef CONFIG_VIDEO_RETAIN
-
-# Store screen contents to temporary buffer.
-store_screen:
-	cmpb	$0, do_restore			# Already stored?
-	jnz	stsr
-
-	testb	$CAN_USE_HEAP, loadflags	# Have we space for storing?
-	jz	stsr
-	
-	pushw	%ax
-	pushw	%bx
-	pushw	force_size			# Don't force specific size
-	movw	$0, force_size
-	call	mode_params			# Obtain params of current mode
-	popw	force_size
-	movb	%fs:(PARAM_VIDEO_LINES), %ah
-	movb	%fs:(PARAM_VIDEO_COLS), %al
-	movw	%ax, %bx			# BX=dimensions
-	mulb	%ah
-	movw	%ax, %cx			# CX=number of characters
-	addw	%ax, %ax			# Calculate image size
-	addw	$modelist+1024+4, %ax
-	cmpw	heap_end_ptr, %ax
-	jnc	sts1				# Unfortunately, out of memory
-
-	movw	%fs:(PARAM_CURSOR_POS), %ax	# Store mode params
-	leaw	modelist+1024, %di
-	stosw
-	movw	%bx, %ax
-	stosw
-	pushw	%ds				# Store the screen
-	movw	video_segment, %ds
-	xorw	%si, %si
-	rep
-	movsw
-	popw	%ds
-	incb	do_restore			# Screen will be restored later
-sts1:	popw	%bx
-	popw	%ax
-stsr:	ret
-
-# Restore screen contents from temporary buffer.
-restore_screen:
-	cmpb	$0, do_restore			# Has the screen been stored?
-	jz	res1
-
-	call	mode_params			# Get parameters of current mode
-	movb	%fs:(PARAM_VIDEO_LINES), %cl
-	movb	%fs:(PARAM_VIDEO_COLS), %ch
-	leaw	modelist+1024, %si		# Screen buffer
-	lodsw					# Set cursor position
-	movw	%ax, %dx
-	cmpb	%cl, %dh
-	jc	res2
-	
-	movb	%cl, %dh
-	decb	%dh
-res2:	cmpb	%ch, %dl
-	jc	res3
-	
-	movb	%ch, %dl
-	decb	%dl
-res3:	movb	$0x02, %ah
-	movb	$0x00, %bh
-	int	$0x10
-	lodsw					# Display size
-	movb	%ah, %dl			# DL=number of lines
-	movb	$0, %ah				# BX=phys. length of orig. line
-	movw	%ax, %bx
-	cmpb	%cl, %dl			# Too many?
-	jc	res4
-
-	pushw	%ax
-	movb	%dl, %al
-	subb	%cl, %al
-	mulb	%bl
-	addw	%ax, %si
-	addw	%ax, %si
-	popw	%ax
-	movb	%cl, %dl
-res4:	cmpb	%ch, %al			# Too wide?
-	jc	res5
-	
-	movb	%ch, %al			# AX=width of src. line
-res5:	movb	$0, %cl
-	xchgb	%ch, %cl
-	movw	%cx, %bp			# BP=width of dest. line
-	pushw	%es
-	movw	video_segment, %es
-	xorw	%di, %di			# Move the data
-	addw	%bx, %bx			# Convert BX and BP to _bytes_
-	addw	%bp, %bp
-res6:	pushw	%si
-	pushw	%di
-	movw	%ax, %cx
-	rep
-	movsw
-	popw	%di
-	popw	%si
-	addw	%bp, %di
-	addw	%bx, %si
-	decb	%dl
-	jnz	res6
-	
-	popw	%es				# Done
-res1:	ret
-#endif /* CONFIG_VIDEO_RETAIN */
-
-# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port)
-outidx:	outb	%al, %dx
-	pushw	%ax
-	movb	%ah, %al
-	incw	%dx
-	outb	%al, %dx
-	decw	%dx
-	popw	%ax
-	ret
-
-# Build the table of video modes (stored after the setup.S code at the
-# `modelist' label. Each video mode record looks like:
-#	.word	MODE-ID		(our special mode ID (see above))
-#	.byte	rows		(number of rows)
-#	.byte	columns		(number of columns)
-# Returns address of the end of the table in DI, the end is marked
-# with a ASK_VGA ID.
-mode_table:
-	movw	mt_end, %di			# Already filled?
-	orw	%di, %di
-	jnz	mtab1x
-	
-	leaw	modelist, %di			# Store standard modes:
-	movl	$VIDEO_80x25 + 0x50190000, %eax	# The 80x25 mode (ALL)
-	stosl
-	movb	adapter, %al			# CGA/MDA/HGA -- no more modes
-	orb	%al, %al
-	jz	mtabe
-	
-	decb	%al
-	jnz	mtabv
-	
-	movl	$VIDEO_8POINT + 0x502b0000, %eax	# The 80x43 EGA mode
-	stosl
-	jmp	mtabe
-
-mtab1x:	jmp	mtab1
-
-mtabv:	leaw	vga_modes, %si			# All modes for std VGA
-	movw	$vga_modes_end-vga_modes, %cx
-	rep	# I'm unable to use movsw as I don't know how to store a half
-	movsb	# of the expression above to cx without using explicit shr.
-
-	cmpb	$0, scanning			# Mode scan requested?
-	jz	mscan1
-	
-	call	mode_scan
-mscan1:
-
-#ifdef CONFIG_VIDEO_LOCAL
-	call	local_modes
-#endif /* CONFIG_VIDEO_LOCAL */
-
-#ifdef CONFIG_VIDEO_VESA
-	call	vesa_modes			# Detect VESA VGA modes
-#endif /* CONFIG_VIDEO_VESA */
-
-#ifdef CONFIG_VIDEO_SVGA
-	cmpb	$0, scanning			# Bypass when scanning
-	jnz	mscan2
-	
-	call	svga_modes			# Detect SVGA cards & modes
-mscan2:
-#endif /* CONFIG_VIDEO_SVGA */
-
-mtabe:
-
-#ifdef CONFIG_VIDEO_COMPACT
-	leaw	modelist, %si
-	movw	%di, %dx
-	movw	%si, %di
-cmt1:	cmpw	%dx, %si			# Scan all modes
-	jz	cmt2
-
-	leaw	modelist, %bx			# Find in previous entries
-	movw	2(%si), %cx
-cmt3:	cmpw	%bx, %si
-	jz	cmt4
-
-	cmpw	2(%bx), %cx			# Found => don't copy this entry
-	jz	cmt5
-
-	addw	$4, %bx
-	jmp	cmt3
-
-cmt4:	movsl					# Copy entry
-	jmp	cmt1
-
-cmt5:	addw	$4, %si				# Skip entry
-	jmp	cmt1
-
-cmt2:
-#endif	/* CONFIG_VIDEO_COMPACT */
-
-	movw	$ASK_VGA, (%di)			# End marker
-	movw	%di, mt_end
-mtab1:	leaw	modelist, %si			# SI=mode list, DI=list end
-ret0:	ret
-
-# Modes usable on all standard VGAs
-vga_modes:
-	.word	VIDEO_8POINT
-	.word	0x5032				# 80x50
-	.word	VIDEO_80x43
-	.word	0x502b				# 80x43
-	.word	VIDEO_80x28
-	.word	0x501c				# 80x28
-	.word	VIDEO_80x30
-	.word	0x501e				# 80x30
-	.word	VIDEO_80x34
-	.word	0x5022				# 80x34
-	.word	VIDEO_80x60
-	.word	0x503c				# 80x60
-#ifdef CONFIG_VIDEO_GFX_HACK
-	.word	VIDEO_GFX_HACK
-	.word	VIDEO_GFX_DUMMY_RESOLUTION
-#endif
-
-vga_modes_end:
-# Detect VESA modes.
-
-#ifdef CONFIG_VIDEO_VESA
-vesa_modes:
-	cmpb	$2, adapter			# VGA only
-	jnz	ret0
-
-	movw	%di, %bp			# BP=original mode table end
-	addw	$0x200, %di			# Buffer space
-	movw	$0x4f00, %ax			# VESA Get card info call
-	int	$0x10
-	movw	%bp, %di
-	cmpw	$0x004f, %ax			# Successful?
-	jnz	ret0
-	
-	cmpw	$0x4556, 0x200(%di)
-	jnz	ret0
-	
-	cmpw	$0x4153, 0x202(%di)
-	jnz	ret0
-	
-	movw	$vesa_name, card_name		# Set name to "VESA VGA"
-	pushw	%gs
-	lgsw	0x20e(%di), %si			# GS:SI=mode list
-	movw	$128, %cx			# Iteration limit
-vesa1:
-# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst.
-# XXX:	lodsw	%gs:(%si), %ax			# Get next mode in the list
-	gs; lodsw
-	cmpw	$0xffff, %ax			# End of the table?
-	jz	vesar
-	
-	cmpw	$0x0080, %ax			# Check validity of mode ID
-	jc	vesa2
-	
-	orb	%ah, %ah		# Valid IDs: 0x0000-0x007f/0x0100-0x07ff
-	jz	vesan			# Certain BIOSes report 0x80-0xff!
-
-	cmpw	$0x0800, %ax
-	jnc	vesae
-
-vesa2:	pushw	%cx
-	movw	%ax, %cx			# Get mode information structure
-	movw	$0x4f01, %ax
-	int	$0x10
-	movw	%cx, %bx			# BX=mode number
-	addb	$VIDEO_FIRST_VESA>>8, %bh
-	popw	%cx
-	cmpw	$0x004f, %ax
-	jnz	vesan			# Don't report errors (buggy BIOSES)
-
-	movb	(%di), %al			# Check capabilities. We require
-	andb	$0x19, %al			# a color text mode.
-	cmpb	$0x09, %al
-	jnz	vesan
-	
-	cmpw	$0xb800, 8(%di)		# Standard video memory address required
-	jnz	vesan
-
-	testb	$2, (%di)			# Mode characteristics supplied?
-	movw	%bx, (%di)			# Store mode number
-	jz	vesa3
-	
-	xorw	%dx, %dx
-	movw	0x12(%di), %bx			# Width
-	orb	%bh, %bh
-	jnz	vesan
-	
-	movb	%bl, 0x3(%di)
-	movw	0x14(%di), %ax			# Height
-	orb	%ah, %ah
-	jnz	vesan
-	
-	movb	%al, 2(%di)
-	mulb	%bl
-	cmpw	$8193, %ax		# Small enough for Linux console driver?
-	jnc	vesan
-
-	jmp	vesaok
-
-vesa3:	subw	$0x8108, %bx	# This mode has no detailed info specified,
-	jc	vesan		# so it must be a standard VESA mode.
-
-	cmpw	$5, %bx
-	jnc	vesan
-
-	movw	vesa_text_mode_table(%bx), %ax
-	movw	%ax, 2(%di)
-vesaok:	addw	$4, %di				# The mode is valid. Store it.
-vesan:	loop	vesa1			# Next mode. Limit exceeded => error
-vesae:	leaw	vesaer, %si
-	call	prtstr
-	movw	%bp, %di			# Discard already found modes.
-vesar:	popw	%gs
-	ret
-
-# Dimensions of standard VESA text modes
-vesa_text_mode_table:
-	.byte	60, 80				# 0108
-	.byte	25, 132				# 0109
-	.byte	43, 132				# 010A
-	.byte	50, 132				# 010B
-	.byte	60, 132				# 010C
-#endif	/* CONFIG_VIDEO_VESA */
-
-# Scan for video modes. A bit dirty, but should work.
-mode_scan:
-	movw	$0x0100, %cx			# Start with mode 0
-scm1:	movb	$0, %ah				# Test the mode
-	movb	%cl, %al
-	int	$0x10
-	movb	$0x0f, %ah
-	int	$0x10
-	cmpb	%cl, %al
-	jnz	scm2				# Mode not set
-
-	movw	$0x3c0, %dx			# Test if it's a text mode
-	movb	$0x10, %al			# Mode bits
-	call	inidx
-	andb	$0x03, %al
-	jnz	scm2
-	
-	movb	$0xce, %dl			# Another set of mode bits
-	movb	$0x06, %al
-	call	inidx
-	shrb	%al
-	jc	scm2
-	
-	movb	$0xd4, %dl			# Cursor location
-	movb	$0x0f, %al
-	call	inidx
-	orb	%al, %al
-	jnz	scm2
-	
-	movw	%cx, %ax			# Ok, store the mode
-	stosw
-	movb	%gs:(0x484), %al		# Number of rows
-	incb	%al
-	stosb
-	movw	%gs:(0x44a), %ax		# Number of columns
-	stosb
-scm2:	incb	%cl
-	jns	scm1
-	
-	movw	$0x0003, %ax			# Return back to mode 3
-	int	$0x10
-	ret
-
-tstidx:	outw	%ax, %dx			# OUT DX,AX and inidx
-inidx:	outb	%al, %dx			# Read from indexed VGA register
-	incw	%dx			# AL=index, DX=index reg port -> AL=data
-	inb	%dx, %al
-	decw	%dx
-	ret
-
-# Try to detect type of SVGA card and supply (usually approximate) video
-# mode table for it.
-
-#ifdef CONFIG_VIDEO_SVGA
-svga_modes:
-	leaw	svga_table, %si			# Test all known SVGA adapters
-dosvga:	lodsw
-	movw	%ax, %bp			# Default mode table
-	orw	%ax, %ax
-	jz	didsv1
-
-	lodsw					# Pointer to test routine
-	pushw	%si
-	pushw	%di
-	pushw	%es
-	movw	$0xc000, %bx
-	movw	%bx, %es
-	call	*%ax				# Call test routine
-	popw	%es
-	popw	%di
-	popw	%si
-	orw	%bp, %bp
-	jz	dosvga
-	
-	movw	%bp, %si			# Found, copy the modes
-	movb	svga_prefix, %ah
-cpsvga:	lodsb
-	orb	%al, %al
-	jz	didsv
-	
-	stosw
-	movsw
-	jmp	cpsvga
-
-didsv:	movw	%si, card_name			# Store pointer to card name
-didsv1:	ret
-
-# Table of all known SVGA cards. For each card, we store a pointer to
-# a table of video modes supported by the card and a pointer to a routine
-# used for testing of presence of the card. The video mode table is always
-# followed by the name of the card or the chipset.
-svga_table:
-	.word	ati_md, ati_test
-	.word	oak_md, oak_test
-	.word	paradise_md, paradise_test
-	.word	realtek_md, realtek_test
-	.word	s3_md, s3_test
-	.word	chips_md, chips_test
-	.word	video7_md, video7_test
-	.word	cirrus5_md, cirrus5_test
-	.word	cirrus6_md, cirrus6_test
-	.word	cirrus1_md, cirrus1_test
-	.word	ahead_md, ahead_test
-	.word	everex_md, everex_test
-	.word	genoa_md, genoa_test
-	.word	trident_md, trident_test
-	.word	tseng_md, tseng_test
-	.word	0
-
-# Test routines and mode tables:
-
-# S3 - The test algorithm was taken from the SuperProbe package
-# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org
-s3_test:
-	movw	$0x0f35, %cx	# we store some constants in cl/ch
-	movw	$0x03d4, %dx
-	movb	$0x38, %al
-	call	inidx
-	movb	%al, %bh	# store current CRT-register 0x38
-	movw	$0x0038, %ax
-	call	outidx		# disable writing to special regs
-	movb	%cl, %al	# check whether we can write special reg 0x35
-	call	inidx
-	movb	%al, %bl	# save the current value of CRT reg 0x35
-	andb	$0xf0, %al	# clear bits 0-3
-	movb	%al, %ah
-	movb	%cl, %al	# and write it to CRT reg 0x35
-	call	outidx
-	call	inidx		# now read it back
-	andb	%ch, %al	# clear the upper 4 bits
-	jz	s3_2		# the first test failed. But we have a
-
-	movb	%bl, %ah	# second chance
-	movb	%cl, %al
-	call	outidx
-	jmp	s3_1		# do the other tests
-
-s3_2:	movw	%cx, %ax	# load ah with 0xf and al with 0x35
-	orb	%bl, %ah	# set the upper 4 bits of ah with the orig value
-	call	outidx		# write ...
-	call	inidx		# ... and reread 
-	andb	%cl, %al	# turn off the upper 4 bits
-	pushw	%ax
-	movb	%bl, %ah	# restore old value in register 0x35
-	movb	%cl, %al
-	call	outidx
-	popw	%ax
-	cmpb	%ch, %al	# setting lower 4 bits was successful => bad
-	je	no_s3		# writing is allowed => this is not an S3
-
-s3_1:	movw	$0x4838, %ax	# allow writing to special regs by putting
-	call	outidx		# magic number into CRT-register 0x38
-	movb	%cl, %al	# check whether we can write special reg 0x35
-	call	inidx
-	movb	%al, %bl
-	andb	$0xf0, %al
-	movb	%al, %ah
-	movb	%cl, %al
-	call	outidx
-	call	inidx
-	andb	%ch, %al
-	jnz	no_s3		# no, we can't write => no S3
-
-	movw	%cx, %ax
-	orb	%bl, %ah
-	call	outidx
-	call	inidx
-	andb	%ch, %al
-	pushw	%ax
-	movb	%bl, %ah	# restore old value in register 0x35
-	movb	%cl, %al
-	call	outidx
-	popw	%ax
-	cmpb	%ch, %al
-	jne	no_s31		# writing not possible => no S3
-	movb	$0x30, %al
-	call	inidx		# now get the S3 id ...
-	leaw	idS3, %di
-	movw	$0x10, %cx
-	repne
-	scasb
-	je	no_s31
-
-	movb	%bh, %ah
-	movb	$0x38, %al
-	jmp	s3rest
-
-no_s3:	movb	$0x35, %al	# restore CRT register 0x35
-	movb	%bl, %ah
-	call	outidx
-no_s31:	xorw	%bp, %bp	# Detection failed
-s3rest:	movb	%bh, %ah
-	movb	$0x38, %al	# restore old value of CRT register 0x38
-	jmp	outidx
-
-idS3:	.byte	0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
-	.byte	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0
-
-s3_md:	.byte	0x54, 0x2b, 0x84
-	.byte	0x55, 0x19, 0x84
-	.byte	0
-	.ascii	"S3"
-	.byte	0
-
-# ATI cards.
-ati_test:
-	leaw 	idati, %si
-	movw	$0x31, %di
-	movw	$0x09, %cx
-	repe
-	cmpsb
-	je	atiok
-
-	xorw	%bp, %bp
-atiok:	ret
-
-idati:	.ascii	"761295520"
-
-ati_md:	.byte	0x23, 0x19, 0x84
-	.byte	0x33, 0x2c, 0x84
-	.byte	0x22, 0x1e, 0x64
-	.byte	0x21, 0x19, 0x64
-	.byte	0x58, 0x21, 0x50
-	.byte	0x5b, 0x1e, 0x50
-	.byte	0
-	.ascii	"ATI"
-	.byte	0
-
-# AHEAD
-ahead_test:
-	movw	$0x200f, %ax
-	movw	$0x3ce, %dx
-	outw	%ax, %dx
-	incw	%dx
-	inb	%dx, %al
-	cmpb	$0x20, %al
-	je	isahed
-
-	cmpb	$0x21, %al
-	je	isahed
-	
-	xorw	%bp, %bp
-isahed:	ret
-
-ahead_md:
-	.byte	0x22, 0x2c, 0x84
-	.byte	0x23, 0x19, 0x84
-	.byte	0x24, 0x1c, 0x84
-	.byte	0x2f, 0x32, 0xa0
-	.byte	0x32, 0x22, 0x50
-	.byte	0x34, 0x42, 0x50
-	.byte	0
-	.ascii	"Ahead"
-	.byte	0
-
-# Chips & Tech.
-chips_test:
-	movw	$0x3c3, %dx
-	inb	%dx, %al
-	orb	$0x10, %al
-	outb	%al, %dx
-	movw	$0x104, %dx
-	inb	%dx, %al
-	movb	%al, %bl
-	movw	$0x3c3, %dx
-	inb	%dx, %al
-	andb	$0xef, %al
-	outb	%al, %dx
-	cmpb	$0xa5, %bl
-	je	cantok
-	
-	xorw	%bp, %bp
-cantok:	ret
-
-chips_md:
-	.byte	0x60, 0x19, 0x84
-	.byte	0x61, 0x32, 0x84
-	.byte	0
-	.ascii	"Chips & Technologies"
-	.byte	0
-
-# Cirrus Logic 5X0
-cirrus1_test:
-	movw	$0x3d4, %dx
-	movb	$0x0c, %al
-	outb	%al, %dx
-	incw	%dx
-	inb	%dx, %al
-	movb	%al, %bl
-	xorb	%al, %al
-	outb	%al, %dx
-	decw	%dx
-	movb	$0x1f, %al
-	outb	%al, %dx
-	incw	%dx
-	inb	%dx, %al
-	movb	%al, %bh
-	xorb	%ah, %ah
-	shlb	$4, %al
-	movw	%ax, %cx
-	movb	%bh, %al
-	shrb	$4, %al
-	addw	%ax, %cx
-	shlw	$8, %cx
-	addw	$6, %cx
-	movw	%cx, %ax
-	movw	$0x3c4, %dx
-	outw	%ax, %dx
-	incw	%dx
-	inb	%dx, %al
-	andb	%al, %al
-	jnz	nocirr
-	
-	movb	%bh, %al
-	outb	%al, %dx
-	inb	%dx, %al
-	cmpb	$0x01, %al
-	je	iscirr
-
-nocirr:	xorw	%bp, %bp
-iscirr: movw	$0x3d4, %dx
-	movb	%bl, %al
-	xorb	%ah, %ah
-	shlw	$8, %ax
-	addw	$0x0c, %ax
-	outw	%ax, %dx
-	ret
-
-cirrus1_md:
-	.byte	0x1f, 0x19, 0x84
-	.byte	0x20, 0x2c, 0x84
-	.byte	0x22, 0x1e, 0x84
-	.byte	0x31, 0x25, 0x64
-	.byte	0
-	.ascii	"Cirrus Logic 5X0"
-	.byte	0
-
-# Cirrus Logic 54XX
-cirrus5_test:
-	movw	$0x3c4, %dx
-	movb	$6, %al
-	call	inidx
-	movb	%al, %bl			# BL=backup
-	movw	$6, %ax
-	call	tstidx
-	cmpb	$0x0f, %al
-	jne	c5fail
-	
-	movw	$0x1206, %ax
-	call	tstidx
-	cmpb	$0x12, %al
-	jne	c5fail
-	
-	movb	$0x1e, %al
-	call	inidx
-	movb	%al, %bh
-	movb	%bh, %ah
-	andb	$0xc0, %ah
-	movb	$0x1e, %al
-	call	tstidx
-	andb	$0x3f, %al
-	jne	c5xx
-	
-	movb	$0x1e, %al
-	movb	%bh, %ah
-	orb	$0x3f, %ah
-	call	tstidx
-	xorb	$0x3f, %al
-	andb	$0x3f, %al
-c5xx:	pushf
-	movb	$0x1e, %al
-	movb	%bh, %ah
-	outw	%ax, %dx
-	popf
-	je	c5done
-
-c5fail:	xorw	%bp, %bp
-c5done:	movb	$6, %al
-	movb	%bl, %ah
-	outw	%ax, %dx
-	ret
-
-cirrus5_md:
-	.byte	0x14, 0x19, 0x84
-	.byte	0x54, 0x2b, 0x84
-	.byte	0
-	.ascii	"Cirrus Logic 54XX"
-	.byte	0
-
-# Cirrus Logic 64XX -- no known extra modes, but must be identified, because
-# it's misidentified by the Ahead test.
-cirrus6_test:
-	movw	$0x3ce, %dx
-	movb	$0x0a, %al
-	call	inidx
-	movb	%al, %bl	# BL=backup
-	movw	$0xce0a, %ax
-	call	tstidx
-	orb	%al, %al
-	jne	c2fail
-	
-	movw	$0xec0a, %ax
-	call	tstidx
-	cmpb	$0x01, %al
-	jne	c2fail
-	
-	movb	$0xaa, %al
-	call	inidx		# 4X, 5X, 7X and 8X are valid 64XX chip ID's. 
-	shrb	$4, %al
-	subb	$4, %al
-	jz	c6done
-	
-	decb	%al
-	jz	c6done
-	
-	subb	$2, %al
-	jz	c6done
-	
-	decb	%al
-	jz	c6done
-	
-c2fail:	xorw	%bp, %bp
-c6done:	movb	$0x0a, %al
-	movb	%bl, %ah
-	outw	%ax, %dx
-	ret
-
-cirrus6_md:
-	.byte	0
-	.ascii	"Cirrus Logic 64XX"
-	.byte	0
-
-# Everex / Trident
-everex_test:
-	movw	$0x7000, %ax
-	xorw	%bx, %bx
-	int	$0x10
-	cmpb	$0x70, %al
-	jne	noevrx
-	
-	shrw	$4, %dx
-	cmpw	$0x678, %dx
-	je	evtrid
-	
-	cmpw	$0x236, %dx
-	jne	evrxok
-
-evtrid:	leaw	trident_md, %bp
-evrxok:	ret
-
-noevrx:	xorw	%bp, %bp
-	ret
-
-everex_md:
-	.byte	0x03, 0x22, 0x50
-	.byte	0x04, 0x3c, 0x50
-	.byte	0x07, 0x2b, 0x64
-	.byte	0x08, 0x4b, 0x64
-	.byte	0x0a, 0x19, 0x84
-	.byte	0x0b, 0x2c, 0x84
-	.byte	0x16, 0x1e, 0x50
-	.byte	0x18, 0x1b, 0x64
-	.byte	0x21, 0x40, 0xa0
-	.byte	0x40, 0x1e, 0x84
-	.byte	0
-	.ascii	"Everex/Trident"
-	.byte	0
-
-# Genoa.
-genoa_test:
-	leaw	idgenoa, %si			# Check Genoa 'clues'
-	xorw	%ax, %ax
-	movb	%es:(0x37), %al
-	movw	%ax, %di
-	movw	$0x04, %cx
-	decw	%si
-	decw	%di
-l1:	incw	%si
-	incw	%di
-	movb	(%si), %al
-	testb	%al, %al
-	jz	l2
-
-	cmpb	%es:(%di), %al
-l2:	loope 	l1
-	orw	%cx, %cx
-	je	isgen
-	
-	xorw	%bp, %bp
-isgen:	ret
-
-idgenoa: .byte	0x77, 0x00, 0x99, 0x66
-
-genoa_md:
-	.byte	0x58, 0x20, 0x50
-	.byte	0x5a, 0x2a, 0x64
-	.byte	0x60, 0x19, 0x84
-	.byte	0x61, 0x1d, 0x84
-	.byte	0x62, 0x20, 0x84
-	.byte	0x63, 0x2c, 0x84
-	.byte	0x64, 0x3c, 0x84
-	.byte	0x6b, 0x4f, 0x64
-	.byte	0x72, 0x3c, 0x50
-	.byte	0x74, 0x42, 0x50
-	.byte	0x78, 0x4b, 0x64
-	.byte	0
-	.ascii	"Genoa"
-	.byte	0
-
-# OAK
-oak_test:
-	leaw	idoakvga, %si
-	movw	$0x08, %di
-	movw	$0x08, %cx
-	repe
-	cmpsb
-	je	isoak
-	
-	xorw	%bp, %bp
-isoak:	ret
-
-idoakvga: .ascii  "OAK VGA "
-
-oak_md: .byte	0x4e, 0x3c, 0x50
-	.byte	0x4f, 0x3c, 0x84
-	.byte	0x50, 0x19, 0x84
-	.byte	0x51, 0x2b, 0x84
-	.byte	0
-	.ascii	"OAK"
-	.byte	0
-
-# WD Paradise.
-paradise_test:
-	leaw	idparadise, %si
-	movw	$0x7d, %di
-	movw	$0x04, %cx
-	repe
-	cmpsb
-	je	ispara
-	
-	xorw	%bp, %bp
-ispara:	ret
-
-idparadise:	.ascii	"VGA="
-
-paradise_md:
-	.byte	0x41, 0x22, 0x50
-	.byte	0x47, 0x1c, 0x84
-	.byte	0x55, 0x19, 0x84
-	.byte	0x54, 0x2c, 0x84
-	.byte	0
-	.ascii	"Paradise"
-	.byte	0
-
-# Trident.
-trident_test:
-	movw	$0x3c4, %dx
-	movb	$0x0e, %al
-	outb	%al, %dx
-	incw	%dx
-	inb	%dx, %al
-	xchgb	%al, %ah
-	xorb	%al, %al
-	outb	%al, %dx
-	inb	%dx, %al
-	xchgb	%ah, %al
-	movb	%al, %bl	# Strange thing ... in the book this wasn't
-	andb	$0x02, %bl	# necessary but it worked on my card which
-	jz	setb2		# is a trident. Without it the screen goes
-				# blurred ...
-	andb	$0xfd, %al
-	jmp	clrb2		
-
-setb2:	orb	$0x02, %al	
-clrb2:	outb	%al, %dx
-	andb	$0x0f, %ah
-	cmpb	$0x02, %ah
-	je	istrid
-
-	xorw	%bp, %bp
-istrid:	ret
-
-trident_md:
-	.byte	0x50, 0x1e, 0x50
-	.byte	0x51, 0x2b, 0x50
-	.byte	0x52, 0x3c, 0x50
-	.byte	0x57, 0x19, 0x84
-	.byte	0x58, 0x1e, 0x84
-	.byte	0x59, 0x2b, 0x84
-	.byte	0x5a, 0x3c, 0x84
-	.byte	0
-	.ascii	"Trident"
-	.byte	0
-
-# Tseng.
-tseng_test:
-	movw	$0x3cd, %dx
-	inb	%dx, %al	# Could things be this simple ! :-)
-	movb	%al, %bl
-	movb	$0x55, %al
-	outb	%al, %dx
-	inb	%dx, %al
-	movb	%al, %ah
-	movb	%bl, %al
-	outb	%al, %dx
-	cmpb	$0x55, %ah
- 	je	istsen
-
-isnot:	xorw	%bp, %bp
-istsen:	ret
-
-tseng_md:
-	.byte	0x26, 0x3c, 0x50
-	.byte	0x2a, 0x28, 0x64
-	.byte	0x23, 0x19, 0x84
-	.byte	0x24, 0x1c, 0x84
-	.byte	0x22, 0x2c, 0x84
-	.byte	0x21, 0x3c, 0x84
-	.byte	0
-	.ascii	"Tseng"
-	.byte	0
-
-# Video7.
-video7_test:
-	movw	$0x3cc, %dx
-	inb	%dx, %al
-	movw	$0x3b4, %dx
-	andb	$0x01, %al
-	jz	even7
-
-	movw	$0x3d4, %dx
-even7:	movb	$0x0c, %al
-	outb	%al, %dx
-	incw	%dx
-	inb	%dx, %al
-	movb	%al, %bl
-	movb	$0x55, %al
-	outb	%al, %dx
-	inb	%dx, %al
-	decw	%dx
-	movb	$0x1f, %al
-	outb	%al, %dx
-	incw	%dx
-	inb	%dx, %al
-	movb	%al, %bh
-	decw	%dx
-	movb	$0x0c, %al
-	outb	%al, %dx
-	incw	%dx
-	movb	%bl, %al
-	outb	%al, %dx
-	movb	$0x55, %al
-	xorb	$0xea, %al
-	cmpb	%bh, %al
-	jne	isnot
-	
-	movb	$VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching
-	ret
-
-video7_md:
-	.byte	0x40, 0x2b, 0x50
-	.byte	0x43, 0x3c, 0x50
-	.byte	0x44, 0x3c, 0x64
-	.byte	0x41, 0x19, 0x84
-	.byte	0x42, 0x2c, 0x84
-	.byte	0x45, 0x1c, 0x84
-	.byte	0
-	.ascii	"Video 7"
-	.byte	0
-
-# Realtek VGA
-realtek_test:
-	leaw	idrtvga, %si
-	movw	$0x45, %di
-	movw	$0x0b, %cx
-	repe
-	cmpsb
-	je	isrt
-	
-	xorw	%bp, %bp
-isrt:	ret
-
-idrtvga:	.ascii	"REALTEK VGA"
-
-realtek_md:
-	.byte	0x1a, 0x3c, 0x50
-	.byte	0x1b, 0x19, 0x84
-	.byte	0x1c, 0x1e, 0x84
-	.byte	0x1d, 0x2b, 0x84
-	.byte	0x1e, 0x3c, 0x84
-	.byte	0
-	.ascii	"REALTEK"
-	.byte	0
-
-#endif	/* CONFIG_VIDEO_SVGA */
-
-# User-defined local mode table (VGA only)
-#ifdef CONFIG_VIDEO_LOCAL
-local_modes:
-	leaw	local_mode_table, %si
-locm1:	lodsw
-	orw	%ax, %ax
-	jz	locm2
-	
-	stosw
-	movsw
-	jmp	locm1
-
-locm2:	ret
-
-# This is the table of local video modes which can be supplied manually
-# by the user. Each entry consists of mode ID (word) and dimensions
-# (byte for column count and another byte for row count). These modes
-# are placed before all SVGA and VESA modes and override them if table
-# compacting is enabled. The table must end with a zero word followed
-# by NUL-terminated video adapter name.
-local_mode_table:
-	.word	0x0100				# Example: 40x25
-	.byte	25,40
-	.word	0
-	.ascii	"Local"
-	.byte	0
-#endif	/* CONFIG_VIDEO_LOCAL */
-
-# Read a key and return the ASCII code in al, scan code in ah
-getkey:	xorb	%ah, %ah
-	int	$0x16
-	ret
-
-# Read a key with a timeout of 30 seconds.
-# The hardware clock is used to get the time.
-getkt:	call	gettime
-	addb	$30, %al			# Wait 30 seconds
-	cmpb	$60, %al
-	jl	lminute
-
-	subb	$60, %al
-lminute:
-	movb	%al, %cl
-again:	movb	$0x01, %ah
-	int	$0x16
-	jnz	getkey				# key pressed, so get it
-
-	call	gettime
-	cmpb	%cl, %al
-	jne	again
-
-	movb	$0x20, %al			# timeout, return `space'
-	ret
-
-# Flush the keyboard buffer
-flush:	movb	$0x01, %ah
-	int	$0x16
-	jz	empty
-	
-	xorb	%ah, %ah
-	int	$0x16
-	jmp	flush
-
-empty:	ret
-
-# Print hexadecimal number.
-prthw:	pushw	%ax
-	movb	%ah, %al
-	call	prthb
-	popw	%ax
-prthb:	pushw	%ax
-	shrb	$4, %al
-	call	prthn
-	popw	%ax
-	andb	$0x0f, %al
-prthn:	cmpb	$0x0a, %al
-	jc	prth1
-
-	addb	$0x07, %al
-prth1:	addb	$0x30, %al
-	jmp	prtchr
-
-# Print decimal number in al
-prtdec:	pushw	%ax
-	pushw	%cx
-	xorb	%ah, %ah
-	movb	$0x0a, %cl
-	idivb	%cl
-	cmpb	$0x09, %al
-	jbe	lt100
-
-	call	prtdec
-	jmp	skip10
-
-lt100:	addb	$0x30, %al
-	call	prtchr
-skip10:	movb	%ah, %al
-	addb	$0x30, %al
-	call	prtchr	
-	popw	%cx
-	popw	%ax
-	ret
-
-store_edid:
-#ifdef CONFIG_FIRMWARE_EDID
-	pushw	%es				# just save all registers
-	pushw	%ax
-	pushw	%bx
-	pushw   %cx
-	pushw	%dx
-	pushw   %di
-
-	pushw	%fs
-	popw    %es
-
-	movl	$0x13131313, %eax		# memset block with 0x13
-	movw    $32, %cx
-	movw	$0x140, %di
-	cld
-	rep
-	stosl
-
-	cmpw	$0x0200, vbe_version		# only do EDID on >= VBE2.0
-	jl	no_edid
-
-	pushw   %es				# save ES
-	xorw    %di, %di                        # Report Capability
-	pushw   %di
-	popw    %es                             # ES:DI must be 0:0
-	movw	$0x4f15, %ax
-	xorw	%bx, %bx
-	xorw	%cx, %cx
-	int	$0x10
-	popw    %es                             # restore ES
-
-	cmpb    $0x00, %ah                      # call successful
-	jne     no_edid
-
-	cmpb    $0x4f, %al                      # function supported
-	jne     no_edid
-
-	movw	$0x4f15, %ax                    # do VBE/DDC
-	movw	$0x01, %bx
-	movw	$0x00, %cx
-	movw    $0x00, %dx
-	movw	$0x140, %di
-	int	$0x10
-
-no_edid:
-	popw	%di				# restore all registers
-	popw	%dx
-	popw	%cx
-	popw	%bx
-	popw	%ax
-	popw	%es
-#endif
-	ret
-
-# VIDEO_SELECT-only variables
-mt_end:		.word	0	# End of video mode table if built
-edit_buf:	.space	6	# Line editor buffer
-card_name:	.word	0	# Pointer to adapter name
-scanning:	.byte	0	# Performing mode scan
-do_restore:	.byte	0	# Screen contents altered during mode change
-svga_prefix:	.byte	VIDEO_FIRST_BIOS>>8	# Default prefix for BIOS modes
-graphic_mode:	.byte	0	# Graphic mode with a linear frame buffer
-dac_size:	.byte	6	# DAC bit depth
-vbe_version:	.word	0	# VBE bios version
-
-# Status messages
-keymsg:		.ascii	"Press <RETURN> to see video modes available, "
-		.ascii	"<SPACE> to continue or wait 30 secs"
-		.byte	0x0d, 0x0a, 0
-
-listhdr:	.byte	0x0d, 0x0a
-		.ascii	"Mode:    COLSxROWS:"
-
-crlft:		.byte	0x0d, 0x0a, 0
-
-prompt:		.byte	0x0d, 0x0a
-		.asciz	"Enter mode number or `scan': "
-
-unknt:		.asciz	"Unknown mode ID. Try again."
-
-badmdt:		.ascii	"You passed an undefined mode number."
-		.byte	0x0d, 0x0a, 0
-
-vesaer:		.ascii	"Error: Scanning of VESA modes failed. Please "
-		.ascii	"report to <mj@ucw.cz>."
-		.byte	0x0d, 0x0a, 0
-
-old_name:	.asciz	"CGA/MDA/HGA"
-
-ega_name:	.asciz	"EGA"
-
-svga_name:	.ascii	" "
-
-vga_name:	.asciz	"VGA"
-
-vesa_name:	.asciz	"VESA"
-
-name_bann:	.asciz	"Video adapter: "
-#endif /* CONFIG_VIDEO_SELECT */
-
-# Other variables:
-adapter:	.byte	0	# Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA
-video_segment:	.word	0xb800	# Video memory segment
-force_size:	.word	0	# Use this size instead of the one in BIOS vars
diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S
deleted file mode 100644
index fff4aa61d7c3..000000000000
--- a/arch/i386/kernel/verify_cpu.S
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Check if CPU has some minimum CPUID bits
-   This runs in 16bit mode so that the caller can still use the BIOS
-   to output errors on the screen */
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
-
-verify_cpu:
-	pushfl				# Save caller passed flags
-	pushl	$0			# Kill any dangerous flags
-	popfl
-
-#if CONFIG_X86_MINIMUM_CPU_FAMILY >= 4
-	pushfl
-	pop	%eax
-	orl	$(1<<18),%eax		# try setting AC
-	push	%eax
-	popfl
-	pushfl
-	popl    %eax
-	testl	$(1<<18),%eax
-	jz	bad
-#endif
-#if REQUIRED_MASK0 != 0
-	pushfl				# standard way to check for cpuid
-	popl	%eax
-	movl	%eax,%ebx
-	xorl	$0x200000,%eax
-	pushl	%eax
-	popfl
-	pushfl
-	popl	%eax
-	cmpl	%eax,%ebx
-	pushfl				# standard way to check for cpuid
-	popl	%eax
-	movl	%eax,%ebx
-	xorl	$0x200000,%eax
-	pushl	%eax
-	popfl
-	pushfl
-	popl	%eax
-	cmpl	%eax,%ebx
-	jz	bad			# REQUIRED_MASK0 != 0 requires CPUID
-
-	movl	$0x0,%eax		# See if cpuid 1 is implemented
-	cpuid
-	cmpl	$0x1,%eax
-	jb	bad			# no cpuid 1
-
-#if REQUIRED_MASK0 & NEED_CMPXCHG64
-	/* Some VIA C3s need magic MSRs to enable CX64. Do this here */
-	cmpl	$0x746e6543,%ebx	# Cent
-	jne	1f
-	cmpl 	$0x48727561,%edx	# aurH
-	jne	1f
-	cmpl	$0x736c7561,%ecx	# auls
-	jne	1f
-	movl	$1,%eax			# check model
-	cpuid
-	movl	%eax,%ebx
-	shr	$8,%ebx
-	andl	$0xf,%ebx
-	cmp	$6,%ebx			# check family == 6
-	jne	1f
-	shr	$4,%eax
-	andl	$0xf,%eax
-	cmpl	$6,%eax			# check model >= 6
-	jb	1f
-	# assume models >= 6 all support this MSR
-	movl	$MSR_VIA_FCR,%ecx
-	rdmsr
-	orl	$((1<<1)|(1<<7)),%eax	# enable CMPXCHG64 and PGE
-	wrmsr
-1:
-#endif
-	movl    $0x1,%eax		# Does the cpu have what it takes
-	cpuid
-
-#if CONFIG_X86_MINIMUM_CPU_FAMILY > 4
-#error	add proper model checking here
-#endif
-
-	andl	$REQUIRED_MASK0,%edx
-	xorl	$REQUIRED_MASK0,%edx
-	jnz	bad
-#endif /* REQUIRED_MASK0 */
-
-	popfl
-	xor	%eax,%eax
-	ret
-
-bad:
-	popfl
-	movl	$1,%eax
-	ret
diff --git a/include/linux/edd.h b/include/linux/edd.h
index b2b3e68aa512..7b647822d6dc 100644
--- a/include/linux/edd.h
+++ b/include/linux/edd.h
@@ -49,10 +49,6 @@
 #define EDD_MBR_SIG_MAX 16        /* max number of signatures to store */
 #define EDD_MBR_SIG_NR_BUF 0x1ea  /* addr of number of MBR signtaures at EDD_MBR_SIG_BUF
 				     in boot_params - treat this as 1 byte  */
-#define EDD_CL_EQUALS   0x3d646465     /* "edd=" */
-#define EDD_CL_OFF      0x666f         /* "of" for off  */
-#define EDD_CL_SKIP     0x6b73         /* "sk" for skipmbr */
-#define EDD_CL_ON       0x6e6f	       /* "on" for on */
 
 #ifndef __ASSEMBLY__
 
-- 
cgit v1.3-8-gc7d7


From 5628221caf88e2a052782b042e12da7cd34111b0 Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@gentoo.org>
Date: Tue, 10 Jul 2007 19:32:10 +0200
Subject: [PATCH] mac80211: ERP IE handling improvements

The "protection needed" flag is currently parsed out of the ERP IE in
beacons. This patch allows the ERP IE to be available at assocation time
and causes the appropriate actions to be performed earlier.

It is slightly complicated by the fact that most APs don't include the
ERP IE in association responses. To work around this, we store ERP
values in the ieee80211_sta_bss structure.

Also added some WLAN_ERP defines for use by upcoming patches.

Signed-off-by: Jiri Benc <jbenc@suse.cz>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h    | 11 +++++++
 net/mac80211/ieee80211_i.h   |  6 ++++
 net/mac80211/ieee80211_sta.c | 69 ++++++++++++++++++++++++++++++++------------
 3 files changed, 67 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index ecd61e8438a5..272f8c8c90da 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -227,6 +227,17 @@ struct ieee80211_cts {
 #define WLAN_CAPABILITY_SHORT_SLOT_TIME	(1<<10)
 #define WLAN_CAPABILITY_DSSS_OFDM	(1<<13)
 
+/* 802.11g ERP information element */
+#define WLAN_ERP_NON_ERP_PRESENT (1<<0)
+#define WLAN_ERP_USE_PROTECTION (1<<1)
+#define WLAN_ERP_BARKER_PREAMBLE (1<<2)
+
+/* WLAN_ERP_BARKER_PREAMBLE values */
+enum {
+	WLAN_ERP_PREAMBLE_SHORT = 0,
+	WLAN_ERP_PREAMBLE_LONG = 1,
+};
+
 /* Status codes */
 enum ieee80211_statuscode {
 	WLAN_STATUS_SUCCESS = 0,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b222a9afd4e9..99ff7c5e9204 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -99,6 +99,12 @@ struct ieee80211_sta_bss {
 	int probe_resp;
 	unsigned long last_update;
 
+	/* during assocation, we save an ERP value from a probe response so
+	 * that we can feed ERP info to the driver when handling the
+	 * association completes. these fields probably won't be up-to-date
+	 * otherwise, you probably don't want to use them. */
+	int has_erp_value;
+	u8 erp_value;
 };
 
 
diff --git a/net/mac80211/ieee80211_sta.c b/net/mac80211/ieee80211_sta.c
index f3ca83743b48..df6c410de161 100644
--- a/net/mac80211/ieee80211_sta.c
+++ b/net/mac80211/ieee80211_sta.c
@@ -314,6 +314,27 @@ static void ieee80211_sta_wmm_params(struct net_device *dev,
 }
 
 
+static void ieee80211_handle_erp_ie(struct net_device *dev, u8 erp_value)
+{
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+	int use_protection = (erp_value & WLAN_ERP_USE_PROTECTION) != 0;
+
+	if (use_protection != !!ifsta->use_protection) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "%s: CTS protection %s (BSSID="
+			       MAC_FMT ")\n",
+			       dev->name,
+			       use_protection ? "enabled" : "disabled",
+			       MAC_ARG(ifsta->bssid));
+		}
+		ifsta->use_protection = use_protection ? 1 : 0;
+		local->cts_protect_erp_frames = use_protection;
+	}
+}
+
+
 static void ieee80211_sta_send_associnfo(struct net_device *dev,
 					 struct ieee80211_if_sta *ifsta)
 {
@@ -377,9 +398,18 @@ static void ieee80211_set_associated(struct net_device *dev,
 
 	if (assoc) {
 		struct ieee80211_sub_if_data *sdata;
+		struct ieee80211_sta_bss *bss;
 		sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 		if (sdata->type != IEEE80211_IF_TYPE_STA)
 			return;
+
+		bss = ieee80211_rx_bss_get(dev, ifsta->bssid);
+		if (bss) {
+			if (bss->has_erp_value)
+				ieee80211_handle_erp_ie(dev, bss->erp_value);
+			ieee80211_rx_bss_put(dev, bss);
+		}
+
 		netif_carrier_on(dev);
 		ifsta->prev_bssid_set = 1;
 		memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
@@ -1177,6 +1207,18 @@ static void ieee80211_rx_mgmt_assoc_resp(struct net_device *dev,
 		return;
 	}
 
+	/* it probably doesn't, but if the frame includes an ERP value then
+	 * update our stored copy */
+	if (elems.erp_info && elems.erp_info_len >= 1) {
+		struct ieee80211_sta_bss *bss
+			= ieee80211_rx_bss_get(dev, ifsta->bssid);
+		if (bss) {
+			bss->erp_value = elems.erp_info[0];
+			bss->has_erp_value = 1;
+			ieee80211_rx_bss_put(dev, bss);
+		}
+	}
+
 	printk(KERN_DEBUG "%s: associated\n", dev->name);
 	ifsta->aid = aid;
 	ifsta->ap_capab = capab_info;
@@ -1499,6 +1541,12 @@ static void ieee80211_rx_bss_info(struct net_device *dev,
 		return;
 	}
 
+	/* save the ERP value so that it is available at association time */
+	if (elems.erp_info && elems.erp_info_len >= 1) {
+		bss->erp_value = elems.erp_info[0];
+		bss->has_erp_value = 1;
+	}
+
 	bss->beacon_int = le16_to_cpu(mgmt->u.beacon.beacon_int);
 	bss->capability = le16_to_cpu(mgmt->u.beacon.capab_info);
 	if (elems.ssid && elems.ssid_len <= IEEE80211_MAX_SSID_LEN) {
@@ -1614,10 +1662,8 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev,
 				     size_t len,
 				     struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_if_sta *ifsta;
-	int use_protection;
 	size_t baselen;
 	struct ieee802_11_elems elems;
 
@@ -1641,23 +1687,8 @@ static void ieee80211_rx_mgmt_beacon(struct net_device *dev,
 				   &elems) == ParseFailed)
 		return;
 
-	use_protection = 0;
-	if (elems.erp_info && elems.erp_info_len >= 1) {
-		use_protection =
-			(elems.erp_info[0] & ERP_INFO_USE_PROTECTION) != 0;
-	}
-
-	if (use_protection != !!ifsta->use_protection) {
-		if (net_ratelimit()) {
-			printk(KERN_DEBUG "%s: CTS protection %s (BSSID="
-			       MAC_FMT ")\n",
-			       dev->name,
-			       use_protection ? "enabled" : "disabled",
-			       MAC_ARG(ifsta->bssid));
-		}
-		ifsta->use_protection = use_protection ? 1 : 0;
-		local->cts_protect_erp_frames = use_protection;
-	}
+	if (elems.erp_info && elems.erp_info_len >= 1)
+		ieee80211_handle_erp_ie(dev, elems.erp_info[0]);
 
 	if (elems.wmm_param && ifsta->wmm_enabled) {
 		ieee80211_sta_wmm_params(dev, ifsta, elems.wmm_param,
-- 
cgit v1.3-8-gc7d7


From 6a775e2ba4f7635849ade628e64723ab2beef0bc Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 21 Jun 2007 12:27:47 +0300
Subject: IB/mlx4: Implement query QP

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/main.c    |   2 +
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   2 +
 drivers/infiniband/hw/mlx4/qp.c      | 137 +++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/qp.c                |  21 ++++++
 include/linux/mlx4/qp.h              |   3 +
 5 files changed, 165 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 2fc8ccebaac1..6b9870a50bea 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -523,6 +523,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
 		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
 		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
 		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
 		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
 		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
@@ -550,6 +551,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.post_srq_recv	= mlx4_ib_post_srq_recv;
 	ibdev->ib_dev.create_qp		= mlx4_ib_create_qp;
 	ibdev->ib_dev.modify_qp		= mlx4_ib_modify_qp;
+	ibdev->ib_dev.query_qp		= mlx4_ib_query_qp;
 	ibdev->ib_dev.destroy_qp	= mlx4_ib_destroy_qp;
 	ibdev->ib_dev.post_send		= mlx4_ib_post_send;
 	ibdev->ib_dev.post_recv		= mlx4_ib_post_recv;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 40b83914b7b2..d6dc57c5ccca 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -267,6 +267,8 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 int mlx4_ib_destroy_qp(struct ib_qp *qp);
 int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		      int attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr);
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		      struct ib_send_wr **bad_wr);
 int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 28a08bdd1800..40042184ad58 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1455,3 +1455,140 @@ out:
 
 	return err;
 }
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+	switch (mlx4_state) {
+	case MLX4_QP_STATE_RST:      return IB_QPS_RESET;
+	case MLX4_QP_STATE_INIT:     return IB_QPS_INIT;
+	case MLX4_QP_STATE_RTR:      return IB_QPS_RTR;
+	case MLX4_QP_STATE_RTS:      return IB_QPS_RTS;
+	case MLX4_QP_STATE_SQ_DRAINING:
+	case MLX4_QP_STATE_SQD:      return IB_QPS_SQD;
+	case MLX4_QP_STATE_SQER:     return IB_QPS_SQE;
+	case MLX4_QP_STATE_ERR:      return IB_QPS_ERR;
+	default:		     return -1;
+	}
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+	switch (mlx4_mig_state) {
+	case MLX4_QP_PM_ARMED:		return IB_MIG_ARMED;
+	case MLX4_QP_PM_REARM:		return IB_MIG_REARM;
+	case MLX4_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
+	default: return -1;
+	}
+}
+
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+	int ib_flags = 0;
+
+	if (mlx4_flags & MLX4_QP_BIT_RRE)
+		ib_flags |= IB_ACCESS_REMOTE_READ;
+	if (mlx4_flags & MLX4_QP_BIT_RWE)
+		ib_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (mlx4_flags & MLX4_QP_BIT_RAE)
+		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+	return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx4_dev *dev, struct ib_ah_attr *ib_ah_attr,
+				struct mlx4_qp_path *path)
+{
+	memset(ib_ah_attr, 0, sizeof *path);
+	ib_ah_attr->port_num	  = path->sched_queue & 0x40 ? 2 : 1;
+
+	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+		return;
+
+	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
+	ib_ah_attr->sl		  = (path->sched_queue >> 2) & 0xf;
+	ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
+	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
+	ib_ah_attr->ah_flags      = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+	if (ib_ah_attr->ah_flags) {
+		ib_ah_attr->grh.sgid_index = path->mgid_index;
+		ib_ah_attr->grh.hop_limit  = path->hop_limit;
+		ib_ah_attr->grh.traffic_class =
+			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+		ib_ah_attr->grh.flow_label =
+			be32_to_cpu(path->tclass_flowlabel) & 0xffffff;
+		memcpy(ib_ah_attr->grh.dgid.raw,
+			path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+	}
+}
+
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_qp_context context;
+	int mlx4_state;
+	int err;
+
+	if (qp->state == IB_QPS_RESET) {
+		qp_attr->qp_state = IB_QPS_RESET;
+		goto done;
+	}
+
+	err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+	if (err)
+		return -EINVAL;
+
+	mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+	qp_attr->qp_state	     = to_ib_qp_state(mlx4_state);
+	qp_attr->path_mtu	     = context.mtu_msgmax >> 5;
+	qp_attr->path_mig_state	     =
+		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+	qp_attr->qkey		     = be32_to_cpu(context.qkey);
+	qp_attr->rq_psn		     = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+	qp_attr->sq_psn		     = be32_to_cpu(context.next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num	     = be32_to_cpu(context.remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags     =
+		to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
+		to_ib_ah_attr(dev->dev, &qp_attr->ah_attr, &context.pri_path);
+		to_ib_ah_attr(dev->dev, &qp_attr->alt_ah_attr, &context.alt_path);
+		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+	qp_attr->port_num   = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+	qp_attr->max_dest_rd_atomic =
+		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+	qp_attr->min_rnr_timer	    =
+		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+	qp_attr->timeout	    = context.pri_path.ackto >> 3;
+	qp_attr->retry_cnt	    = (be32_to_cpu(context.params1) >> 16) & 0x7;
+	qp_attr->rnr_retry	    = (be32_to_cpu(context.params1) >> 13) & 0x7;
+	qp_attr->alt_timeout	    = context.alt_path.ackto >> 3;
+
+done:
+	qp_attr->cur_qp_state	     = qp_attr->qp_state;
+	if (!ibqp->uobject) {
+		qp_attr->cap.max_send_wr     = qp->sq.wqe_cnt;
+		qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
+		qp_attr->cap.max_send_sge    = qp->sq.max_gs;
+		qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
+		qp_attr->cap.max_inline_data = (1 << qp->sq.wqe_shift) -
+			send_wqe_overhead(qp->ibqp.qp_type) -
+			sizeof (struct mlx4_wqe_inline_seg);
+		qp_init_attr->cap	     = qp_attr->cap;
+	}
+
+	return 0;
+}
+
diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c
index 492cfaaaa75c..19b48c71cf7f 100644
--- a/drivers/net/mlx4/qp.c
+++ b/drivers/net/mlx4/qp.c
@@ -277,3 +277,24 @@ void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
 	mlx4_CONF_SPECIAL_QP(dev, 0);
 	mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap);
 }
+
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		  struct mlx4_qp_context *context)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0,
+			   MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A);
+	if (!err)
+		memcpy(context, mailbox->buf + 8, sizeof *context);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_query);
+
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 10c57d279144..3968b943259a 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -282,6 +282,9 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 		   struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
 		   int sqd_event, struct mlx4_qp *qp);
 
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		  struct mlx4_qp_context *context);
+
 static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
 {
 	return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
-- 
cgit v1.3-8-gc7d7


From 65541cb7cf353946ecd78016a453b453b8830656 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 21 Jun 2007 13:03:11 +0300
Subject: IB/mlx4: Implement query SRQ

Signed-off-by: Dotan Barak <dotanb@mellanox.co.il>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/main.c    |  2 ++
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  1 +
 drivers/infiniband/hw/mlx4/srq.c     | 18 ++++++++++++++++++
 drivers/net/mlx4/srq.c               | 30 ++++++++++++++++++++++++++++++
 include/linux/mlx4/device.h          |  1 +
 5 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 6b9870a50bea..dde8fe9af47e 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -529,6 +529,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
 		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
 		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
 		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
 
 	ibdev->ib_dev.query_device	= mlx4_ib_query_device;
@@ -547,6 +548,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.destroy_ah	= mlx4_ib_destroy_ah;
 	ibdev->ib_dev.create_srq	= mlx4_ib_create_srq;
 	ibdev->ib_dev.modify_srq	= mlx4_ib_modify_srq;
+	ibdev->ib_dev.query_srq		= mlx4_ib_query_srq;
 	ibdev->ib_dev.destroy_srq	= mlx4_ib_destroy_srq;
 	ibdev->ib_dev.post_srq_recv	= mlx4_ib_post_srq_recv;
 	ibdev->ib_dev.create_qp		= mlx4_ib_create_qp;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index d6dc57c5ccca..705ff2fa237e 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -256,6 +256,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 				  struct ib_udata *udata);
 int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
 int mlx4_ib_destroy_srq(struct ib_srq *srq);
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
 int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 12fac1c8989d..408748fb5285 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -240,6 +240,24 @@ int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 	return 0;
 }
 
+int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+	int limit_watermark;
+
+	ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark);
+	if (ret)
+		return ret;
+
+	srq_attr->srq_limit = be16_to_cpu(limit_watermark);
+	srq_attr->max_wr    = srq->msrq.max - 1;
+	srq_attr->max_sge   = srq->msrq.max_gs;
+
+	return 0;
+}
+
 int mlx4_ib_destroy_srq(struct ib_srq *srq)
 {
 	struct mlx4_ib_dev *dev = to_mdev(srq->device);
diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c
index 2134f83aed87..b061c86d6839 100644
--- a/drivers/net/mlx4/srq.c
+++ b/drivers/net/mlx4/srq.c
@@ -102,6 +102,13 @@ static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark)
 			MLX4_CMD_TIME_CLASS_B);
 }
 
+static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ,
+			    MLX4_CMD_TIME_CLASS_A);
+}
+
 int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt,
 		   u64 db_rec, struct mlx4_srq *srq)
 {
@@ -205,6 +212,29 @@ int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark
 }
 EXPORT_SYMBOL_GPL(mlx4_srq_arm);
 
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_srq_context *srq_context;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	srq_context = mailbox->buf;
+
+	err = mlx4_QUERY_SRQ(dev, mailbox, srq->srqn);
+	if (err)
+		goto err_out;
+	*limit_watermark = srq_context->limit_watermark;
+
+err_out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_query);
+
 int __devinit mlx4_init_srq_table(struct mlx4_dev *dev)
 {
 	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 8209387ee854..cfb78fb2c046 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -323,6 +323,7 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt,
 		   u64 db_rec, struct mlx4_srq *srq);
 void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq);
 int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark);
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark);
 
 int mlx4_INIT_PORT(struct mlx4_dev *dev, int port);
 int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
-- 
cgit v1.3-8-gc7d7


From ec22559e0b7a05283a3413bda5d177e42c950e23 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.de>
Date: Fri, 27 Apr 2007 20:54:57 +0200
Subject: USB: suspend support for usb serial

this implements generic support for suspend/resume for usb serial.

Signed-off-by: Oliver Neukum <oneukum@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/serial/generic.c    | 18 ++++++++++++++++++
 drivers/usb/serial/usb-serial.c | 31 +++++++++++++++++++++++++++++++
 include/linux/usb/serial.h      |  7 +++++++
 3 files changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 4f8282ad7720..b90ef3f70f4c 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -69,6 +69,7 @@ struct usb_serial_driver usb_serial_generic_device = {
 	.shutdown =		usb_serial_generic_shutdown,
 	.throttle =		usb_serial_generic_throttle,
 	.unthrottle =		usb_serial_generic_unthrottle,
+	.resume =		usb_serial_generic_resume,
 };
 
 static int generic_probe(struct usb_interface *interface,
@@ -169,6 +170,23 @@ static void generic_cleanup (struct usb_serial_port *port)
 	}
 }
 
+int usb_serial_generic_resume(struct usb_serial *serial)
+{
+	struct usb_serial_port *port;
+	int i, c = 0, r;
+
+	for (i = 0; i < serial->num_ports; i++) {
+		port = serial->port[i];
+		if (port->open_count && port->read_urb) {
+			r = usb_submit_urb(port->read_urb, GFP_NOIO);
+			if (r < 0)
+				c++;
+		}
+	}
+
+	return c ? -EIO : 0;
+}
+
 void usb_serial_generic_close (struct usb_serial_port *port, struct file * filp)
 {
 	dbg("%s - port %d", __FUNCTION__, port->number);
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 87f378806db6..e3e3728e16e3 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -46,6 +46,8 @@ static struct usb_driver usb_serial_driver = {
 	.name =		"usbserial",
 	.probe =	usb_serial_probe,
 	.disconnect =	usb_serial_disconnect,
+	.suspend =	usb_serial_suspend,
+	.resume =	usb_serial_resume,
 	.no_dynamic_id = 	1,
 };
 
@@ -1069,6 +1071,35 @@ void usb_serial_disconnect(struct usb_interface *interface)
 	dev_info(dev, "device disconnected\n");
 }
 
+int usb_serial_suspend(struct usb_interface *intf, pm_message_t message)
+{
+	struct usb_serial *serial = usb_get_intfdata(intf);
+	struct usb_serial_port *port;
+	int i, r = 0;
+
+	if (serial) {
+		for (i = 0; i < serial->num_ports; ++i) {
+			port = serial->port[i];
+			if (port)
+				kill_traffic(port);
+		}
+	}
+
+	if (serial->type->suspend)
+		serial->type->suspend(serial, message);
+
+	return r;
+}
+EXPORT_SYMBOL(usb_serial_suspend);
+
+int usb_serial_resume(struct usb_interface *intf)
+{
+	struct usb_serial *serial = usb_get_intfdata(intf);
+
+	return serial->type->resume(serial);
+}
+EXPORT_SYMBOL(usb_serial_resume);
+
 static const struct tty_operations serial_ops = {
 	.open =			serial_open,
 	.close =		serial_close,
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 32acbae28d24..e8b8928232c8 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -221,6 +221,9 @@ struct usb_serial_driver {
 	int (*port_probe) (struct usb_serial_port *port);
 	int (*port_remove) (struct usb_serial_port *port);
 
+	int (*suspend) (struct usb_serial *serial, pm_message_t message);
+	int (*resume) (struct usb_serial *serial);
+
 	/* serial function calls */
 	int  (*open)		(struct usb_serial_port *port, struct file * filp);
 	void (*close)		(struct usb_serial_port *port, struct file * filp);
@@ -249,6 +252,9 @@ extern void usb_serial_port_softint(struct usb_serial_port *port);
 extern int usb_serial_probe(struct usb_interface *iface, const struct usb_device_id *id);
 extern void usb_serial_disconnect(struct usb_interface *iface);
 
+extern int usb_serial_suspend(struct usb_interface *intf, pm_message_t message);
+extern int usb_serial_resume(struct usb_interface *intf);
+
 extern int ezusb_writememory (struct usb_serial *serial, int address, unsigned char *data, int length, __u8 bRequest);
 extern int ezusb_set_reset (struct usb_serial *serial, unsigned char reset_bit);
 
@@ -269,6 +275,7 @@ extern void usb_serial_put(struct usb_serial *serial);
 extern int usb_serial_generic_open (struct usb_serial_port *port, struct file *filp);
 extern int usb_serial_generic_write (struct usb_serial_port *port, const unsigned char *buf, int count);
 extern void usb_serial_generic_close (struct usb_serial_port *port, struct file *filp);
+extern int usb_serial_generic_resume (struct usb_serial *serial);
 extern int usb_serial_generic_write_room (struct usb_serial_port *port);
 extern int usb_serial_generic_chars_in_buffer (struct usb_serial_port *port);
 extern void usb_serial_generic_read_bulk_callback (struct urb *urb);
-- 
cgit v1.3-8-gc7d7


From 0458d5b4c9cc4ca0f62625d0144ddc4b4bc97a3c Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 4 May 2007 11:52:20 -0400
Subject: USB: add USB-Persist facility

This patch (as886) adds the controversial USB-persist facility,
allowing USB devices to persist across a power loss during system
suspend.

The facility is controlled by a new Kconfig option (with appropriate
warnings about the potential dangers); when the option is off the
behavior will remain the same as it is now.  But when the option is
on, people will be able to use suspend-to-disk and keep their USB
filesystems intact -- something particularly valuable for small
machines where the root filesystem is on a USB device!

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/power/swsusp.txt |   3 +
 Documentation/usb/persist.txt  | 144 ++++++++++++++++++++++++++++++
 drivers/hid/usbhid/hid-core.c  |   2 +-
 drivers/usb/core/Kconfig       |  22 +++++
 drivers/usb/core/driver.c      |  39 ++++----
 drivers/usb/core/generic.c     |   5 +-
 drivers/usb/core/hub.c         | 196 +++++++++++++++++++++++++++++------------
 drivers/usb/core/usb.h         |   1 +
 drivers/usb/storage/usb.c      |   8 +-
 include/linux/usb.h            |   8 +-
 10 files changed, 351 insertions(+), 77 deletions(-)
 create mode 100644 Documentation/usb/persist.txt

(limited to 'include/linux')

diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index 5b8d6953f05e..152b510d1bbb 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -393,6 +393,9 @@ safest thing is to unmount all filesystems on removable media (such USB,
 Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
 before suspending; then remount them after resuming.
 
+There is a work-around for this problem.  For more information, see
+Documentation/usb/persist.txt.
+
 Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
 compiled with the similar configuration files. Anyway I found that
 suspend to disk (and resume) is much slower on 2.6.16 compared to
diff --git a/Documentation/usb/persist.txt b/Documentation/usb/persist.txt
new file mode 100644
index 000000000000..6dcd5f884795
--- /dev/null
+++ b/Documentation/usb/persist.txt
@@ -0,0 +1,144 @@
+		USB device persistence during system suspend
+
+		   Alan Stern <stern@rowland.harvard.edu>
+
+		 September 2, 2006 (Updated March 27, 2007)
+
+
+	What is the problem?
+
+According to the USB specification, when a USB bus is suspended the
+bus must continue to supply suspend current (around 1-5 mA).  This
+is so that devices can maintain their internal state and hubs can
+detect connect-change events (devices being plugged in or unplugged).
+The technical term is "power session".
+
+If a USB device's power session is interrupted then the system is
+required to behave as though the device has been unplugged.  It's a
+conservative approach; in the absence of suspend current the computer
+has no way to know what has actually happened.  Perhaps the same
+device is still attached or perhaps it was removed and a different
+device plugged into the port.  The system must assume the worst.
+
+By default, Linux behaves according to the spec.  If a USB host
+controller loses power during a system suspend, then when the system
+wakes up all the devices attached to that controller are treated as
+though they had disconnected.  This is always safe and it is the
+"officially correct" thing to do.
+
+For many sorts of devices this behavior doesn't matter in the least.
+If the kernel wants to believe that your USB keyboard was unplugged
+while the system was asleep and a new keyboard was plugged in when the
+system woke up, who cares?  It'll still work the same when you type on
+it.
+
+Unfortunately problems _can_ arise, particularly with mass-storage
+devices.  The effect is exactly the same as if the device really had
+been unplugged while the system was suspended.  If you had a mounted
+filesystem on the device, you're out of luck -- everything in that
+filesystem is now inaccessible.  This is especially annoying if your
+root filesystem was located on the device, since your system will
+instantly crash.
+
+Loss of power isn't the only mechanism to worry about.  Anything that
+interrupts a power session will have the same effect.  For example,
+even though suspend current may have been maintained while the system
+was asleep, on many systems during the initial stages of wakeup the
+firmware (i.e., the BIOS) resets the motherboard's USB host
+controllers.  Result: all the power sessions are destroyed and again
+it's as though you had unplugged all the USB devices.  Yes, it's
+entirely the BIOS's fault, but that doesn't do _you_ any good unless
+you can convince the BIOS supplier to fix the problem (lots of luck!).
+
+On many systems the USB host controllers will get reset after a
+suspend-to-RAM.  On almost all systems, no suspend current is
+available during suspend-to-disk (also known as swsusp).  You can
+check the kernel log after resuming to see if either of these has
+happened; look for lines saying "root hub lost power or was reset".
+
+In practice, people are forced to unmount any filesystems on a USB
+device before suspending.  If the root filesystem is on a USB device,
+the system can't be suspended at all.  (All right, it _can_ be
+suspended -- but it will crash as soon as it wakes up, which isn't
+much better.)
+
+
+	What is the solution?
+
+Setting CONFIG_USB_PERSIST will cause the kernel to work around these
+issues.  It enables a mode in which the core USB device data
+structures are allowed to persist across a power-session disruption.
+It works like this.  If the kernel sees that a USB host controller is
+not in the expected state during resume (i.e., if the controller was
+reset or otherwise had lost power) then it applies a persistence check
+to each of the USB devices below that controller.  It doesn't try to
+resume the device; that can't work once the power session is gone.
+Instead it issues a USB port reset and then re-enumerates the device.
+(This is exactly the same thing that happens whenever a USB device is
+reset.)  If the re-enumeration shows that the device now attached to
+that port has the same descriptors as before, including the Vendor and
+Product IDs, then the kernel continues to use the same device
+structure.  In effect, the kernel treats the device as though it had
+merely been reset instead of unplugged.
+
+If no device is now attached to the port, or if the descriptors are
+different from what the kernel remembers, then the treatment is what
+you would expect.  The kernel destroys the old device structure and
+behaves as though the old device had been unplugged and a new device
+plugged in, just as it would without the CONFIG_USB_PERSIST option.
+
+The end result is that the USB device remains available and usable.
+Filesystem mounts and memory mappings are unaffected, and the world is
+now a good and happy place.
+
+
+	Is this the best solution?
+
+Perhaps not.  Arguably, keeping track of mounted filesystems and
+memory mappings across device disconnects should be handled by a
+centralized Logical Volume Manager.  Such a solution would allow you
+to plug in a USB flash device, create a persistent volume associated
+with it, unplug the flash device, plug it back in later, and still
+have the same persistent volume associated with the device.  As such
+it would be more far-reaching than CONFIG_USB_PERSIST.
+
+On the other hand, writing a persistent volume manager would be a big
+job and using it would require significant input from the user.  This
+solution is much quicker and easier -- and it exists now, a giant
+point in its favor!
+
+Furthermore, the USB_PERSIST option applies to _all_ USB devices, not
+just mass-storage devices.  It might turn out to be equally useful for
+other device types, such as network interfaces.
+
+
+	WARNING: Using CONFIG_USB_PERSIST can be dangerous!!
+
+When recovering an interrupted power session the kernel does its best
+to make sure the USB device hasn't been changed; that is, the same
+device is still plugged into the port as before.  But the checks
+aren't guaranteed to be 100% accurate.
+
+If you replace one USB device with another of the same type (same
+manufacturer, same IDs, and so on) there's an excellent chance the
+kernel won't detect the change.  Serial numbers and other strings are
+not compared.  In many cases it wouldn't help if they were, because
+manufacturers frequently omit serial numbers entirely in their
+devices.
+
+Furthermore it's quite possible to leave a USB device exactly the same
+while changing its media.  If you replace the flash memory card in a
+USB card reader while the system is asleep, the kernel will have no
+way to know you did it.  The kernel will assume that nothing has
+happened and will continue to use the partition tables, inodes, and
+memory mappings for the old card.
+
+If the kernel gets fooled in this way, it's almost certain to cause
+data corruption and to crash your system.  You'll have no one to blame
+but yourself.
+
+YOU HAVE BEEN WARNED!  USE AT YOUR OWN RISK!
+
+That having been said, most of the time there shouldn't be any trouble
+at all.  The "persist" feature can be extremely useful.  Make the most
+of it.
diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index 3afa4a5035b7..e221b0d1f667 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -1015,7 +1015,7 @@ static void hid_pre_reset(struct usb_interface *intf)
 	hid_suspend(intf, PMSG_ON);
 }
 
-static void hid_post_reset(struct usb_interface *intf)
+static void hid_post_reset(struct usb_interface *intf, int reset_resume)
 {
 	struct usb_device *dev = interface_to_usbdev (intf);
 
diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig
index 346fc030c929..5113ef4cb7f6 100644
--- a/drivers/usb/core/Kconfig
+++ b/drivers/usb/core/Kconfig
@@ -86,6 +86,28 @@ config USB_SUSPEND
 
 	  If you are unsure about this, say N here.
 
+config USB_PERSIST
+	bool "USB device persistence during system suspend (DANGEROUS)"
+	depends on USB && PM && EXPERIMENTAL
+	default n
+	help
+	  If you say Y here, USB device data structures will remain
+	  persistent across system suspend, even if the USB bus loses
+	  power.  (This includes software-suspend, also known as swsusp,
+	  or suspend-to-disk.)  The devices will reappear as if by magic
+	  when the system wakes up, with no need to unmount USB filesystems,
+	  rmmod host-controller drivers, or do anything else.
+
+	  	WARNING: This option can be dangerous!
+
+	  If a USB device is replaced by another of the same type while
+	  the system is asleep, there's a good chance the kernel won't
+	  detect the change.  Likewise if the media in a USB storage
+	  device is replaced.  When this happens it's almost certain to
+	  cause data corruption and maybe even crash your system.
+
+	  If you are unsure, say N here.
+
 config USB_OTG
 	bool
 	depends on USB && EXPERIMENTAL
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index e8b447e06c54..12dd986bdffd 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -824,8 +824,9 @@ static int usb_resume_device(struct usb_device *udev)
 	struct usb_device_driver	*udriver;
 	int				status = 0;
 
-	if (udev->state == USB_STATE_NOTATTACHED ||
-			udev->state != USB_STATE_SUSPENDED)
+	if (udev->state == USB_STATE_NOTATTACHED)
+		goto done;
+	if (udev->state != USB_STATE_SUSPENDED && !udev->reset_resume)
 		goto done;
 
 	/* Can't resume it if it doesn't have a driver. */
@@ -882,7 +883,7 @@ done:
 }
 
 /* Caller has locked intf's usb_device's pm_mutex */
-static int usb_resume_interface(struct usb_interface *intf)
+static int usb_resume_interface(struct usb_interface *intf, int reset_resume)
 {
 	struct usb_driver	*driver;
 	int			status = 0;
@@ -902,21 +903,21 @@ static int usb_resume_interface(struct usb_interface *intf)
 	}
 	driver = to_usb_driver(intf->dev.driver);
 
-	if (driver->resume) {
+	if (reset_resume && driver->post_reset)
+		driver->post_reset(intf, reset_resume);
+	else if (driver->resume) {
 		status = driver->resume(intf);
 		if (status)
 			dev_err(&intf->dev, "%s error %d\n",
 					"resume", status);
-		else
-			mark_active(intf);
-	} else {
+	} else
 		dev_warn(&intf->dev, "no resume for driver %s?\n",
 				driver->name);
-		mark_active(intf);
-	}
 
 done:
 	// dev_dbg(&intf->dev, "%s: status %d\n", __FUNCTION__, status);
+	if (status == 0)
+		mark_active(intf);
 	return status;
 }
 
@@ -1063,7 +1064,7 @@ static int usb_suspend_both(struct usb_device *udev, pm_message_t msg)
 	if (status != 0) {
 		while (--i >= 0) {
 			intf = udev->actconfig->interface[i];
-			usb_resume_interface(intf);
+			usb_resume_interface(intf, 0);
 		}
 
 		/* Try another autosuspend when the interfaces aren't busy */
@@ -1162,20 +1163,21 @@ static int usb_resume_both(struct usb_device *udev)
  		}
 	} else {
 
-		/* Needed only for setting udev->dev.power.power_state.event
-		 * and for possible debugging message. */
+		/* Needed for setting udev->dev.power.power_state.event,
+		 * for possible debugging message, and for reset_resume. */
 		status = usb_resume_device(udev);
 	}
 
 	if (status == 0 && udev->actconfig) {
 		for (i = 0; i < udev->actconfig->desc.bNumInterfaces; i++) {
 			intf = udev->actconfig->interface[i];
-			usb_resume_interface(intf);
+			usb_resume_interface(intf, udev->reset_resume);
 		}
 	}
 
  done:
 	// dev_dbg(&udev->dev, "%s: status %d\n", __FUNCTION__, status);
+	udev->reset_resume = 0;
 	return status;
 }
 
@@ -1510,8 +1512,15 @@ static int usb_resume(struct device *dev)
 	if (!is_usb_device(dev))	/* Ignore PM for interfaces */
 		return 0;
 	udev = to_usb_device(dev);
-	if (udev->autoresume_disabled)
-		return -EPERM;
+
+	/* If autoresume is disabled then we also want to prevent resume
+	 * during system wakeup.  However, a "persistent-device" reset-resume
+	 * after power loss counts as a wakeup event.  So allow a
+	 * reset-resume to occur if remote wakeup is enabled. */
+	if (udev->autoresume_disabled) {
+		if (!(udev->reset_resume && udev->do_remote_wakeup))
+			return -EPERM;
+	}
 	return usb_external_resume_device(udev);
 }
 
diff --git a/drivers/usb/core/generic.c b/drivers/usb/core/generic.c
index 7cbf992adccd..d363b0ea7345 100644
--- a/drivers/usb/core/generic.c
+++ b/drivers/usb/core/generic.c
@@ -217,7 +217,10 @@ static int generic_resume(struct usb_device *udev)
 {
 	int rc;
 
-	rc = usb_port_resume(udev);
+	if (udev->reset_resume)
+		rc = usb_reset_suspended_device(udev);
+	else
+		rc = usb_port_resume(udev);
 
 	/* Root hubs don't have upstream ports to resume or reset,
 	 * so the line above won't do much for them.  We have to
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 77a6627b18d2..51d2d304568b 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -553,45 +553,121 @@ static int hub_hub_status(struct usb_hub *hub,
 static int hub_port_disable(struct usb_hub *hub, int port1, int set_state)
 {
 	struct usb_device *hdev = hub->hdev;
-	int ret;
+	int ret = 0;
 
-	if (hdev->children[port1-1] && set_state) {
+	if (hdev->children[port1-1] && set_state)
 		usb_set_device_state(hdev->children[port1-1],
 				USB_STATE_NOTATTACHED);
-	}
-	ret = clear_port_feature(hdev, port1, USB_PORT_FEAT_ENABLE);
+	if (!hub->error)
+		ret = clear_port_feature(hdev, port1, USB_PORT_FEAT_ENABLE);
 	if (ret)
 		dev_err(hub->intfdev, "cannot disable port %d (err = %d)\n",
-			port1, ret);
-
+				port1, ret);
 	return ret;
 }
 
+/*
+ * Disable a port and mark a logical connnect-change event, so that some
+ * time later khubd will disconnect() any existing usb_device on the port
+ * and will re-enumerate if there actually is a device attached.
+ */
+static void hub_port_logical_disconnect(struct usb_hub *hub, int port1)
+{
+	dev_dbg(hub->intfdev, "logical disconnect on port %d\n", port1);
+	hub_port_disable(hub, port1, 1);
 
-/* caller has locked the hub device */
-static void hub_pre_reset(struct usb_interface *intf)
+	/* FIXME let caller ask to power down the port:
+	 *  - some devices won't enumerate without a VBUS power cycle
+	 *  - SRP saves power that way
+	 *  - ... new call, TBD ...
+	 * That's easy if this hub can switch power per-port, and
+	 * khubd reactivates the port later (timer, SRP, etc).
+	 * Powerdown must be optional, because of reset/DFU.
+	 */
+
+	set_bit(port1, hub->change_bits);
+ 	kick_khubd(hub);
+}
+
+static void disconnect_all_children(struct usb_hub *hub, int logical)
 {
-	struct usb_hub *hub = usb_get_intfdata(intf);
 	struct usb_device *hdev = hub->hdev;
 	int port1;
 
 	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
-		if (hdev->children[port1 - 1]) {
-			usb_disconnect(&hdev->children[port1 - 1]);
-			if (hub->error == 0)
-				hub_port_disable(hub, port1, 0);
+		if (hdev->children[port1-1]) {
+			if (logical)
+				hub_port_logical_disconnect(hub, port1);
+			else
+				usb_disconnect(&hdev->children[port1-1]);
+		}
+	}
+}
+
+#ifdef	CONFIG_USB_PERSIST
+
+#define USB_PERSIST	1
+
+/* For "persistent-device" resets we must mark the child devices for reset
+ * and turn off a possible connect-change status (so khubd won't disconnect
+ * them later).
+ */
+static void mark_children_for_reset_resume(struct usb_hub *hub)
+{
+	struct usb_device *hdev = hub->hdev;
+	int port1;
+
+	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
+		struct usb_device *child = hdev->children[port1-1];
+
+		if (child) {
+			child->reset_resume = 1;
+			clear_port_feature(hdev, port1,
+					USB_PORT_FEAT_C_CONNECTION);
 		}
 	}
+}
+
+#else
+
+#define USB_PERSIST	0
+
+static inline void mark_children_for_reset_resume(struct usb_hub *hub)
+{ }
+
+#endif	/* CONFIG_USB_PERSIST */
+
+/* caller has locked the hub device */
+static void hub_pre_reset(struct usb_interface *intf)
+{
+	struct usb_hub *hub = usb_get_intfdata(intf);
+
+	/* This routine doesn't run as part of a reset-resume, so it's safe
+	 * to disconnect all the drivers below the hub.
+	 */
+	disconnect_all_children(hub, 0);
 	hub_quiesce(hub);
 }
 
 /* caller has locked the hub device */
-static void hub_post_reset(struct usb_interface *intf)
+static void hub_post_reset(struct usb_interface *intf, int reset_resume)
 {
 	struct usb_hub *hub = usb_get_intfdata(intf);
 
-	hub_activate(hub);
 	hub_power_on(hub);
+	if (reset_resume) {
+		if (USB_PERSIST)
+			mark_children_for_reset_resume(hub);
+		else {
+			/* Reset-resume doesn't call pre_reset, so we have to
+			 * disconnect the children here.  But we may not lock
+			 * the child devices, so we have to do a "logical"
+			 * disconnect.
+			 */
+			disconnect_all_children(hub, 1);
+		}
+	}
+	hub_activate(hub);
 }
 
 
@@ -1053,33 +1129,64 @@ void usb_set_device_state(struct usb_device *udev,
 
 #ifdef	CONFIG_PM
 
+/**
+ * usb_reset_suspended_device - reset a suspended device instead of resuming it
+ * @udev: device to be reset instead of resumed
+ *
+ * If a host controller doesn't maintain VBUS suspend current during a
+ * system sleep or is reset when the system wakes up, all the USB
+ * power sessions below it will be broken.  This is especially troublesome
+ * for mass-storage devices containing mounted filesystems, since the
+ * device will appear to have disconnected and all the memory mappings
+ * to it will be lost.
+ *
+ * As an alternative, this routine attempts to recover power sessions for
+ * devices that are still present by resetting them instead of resuming
+ * them.  If all goes well, the devices will appear to persist across the
+ * the interruption of the power sessions.
+ *
+ * This facility is inherently dangerous.  Although usb_reset_device()
+ * makes every effort to insure that the same device is present after the
+ * reset as before, it cannot provide a 100% guarantee.  Furthermore it's
+ * quite possible for a device to remain unaltered but its media to be
+ * changed.  If the user replaces a flash memory card while the system is
+ * asleep, he will have only himself to blame when the filesystem on the
+ * new card is corrupted and the system crashes.
+ */
+int usb_reset_suspended_device(struct usb_device *udev)
+{
+	int rc = 0;
+
+	dev_dbg(&udev->dev, "usb %sresume\n", "reset-");
+
+	/* After we're done the device won't be suspended any more.
+	 * In addition, the reset won't work if udev->state is SUSPENDED.
+	 */
+	usb_set_device_state(udev, udev->actconfig
+			? USB_STATE_CONFIGURED
+			: USB_STATE_ADDRESS);
+
+	/* Root hubs don't need to be (and can't be) reset */
+	if (udev->parent)
+		rc = usb_reset_device(udev);
+	return rc;
+}
+
 /**
  * usb_root_hub_lost_power - called by HCD if the root hub lost Vbus power
  * @rhdev: struct usb_device for the root hub
  *
  * The USB host controller driver calls this function when its root hub
  * is resumed and Vbus power has been interrupted or the controller
- * has been reset.  The routine marks all the children of the root hub
- * as NOTATTACHED and marks logical connect-change events on their ports.
+ * has been reset.  The routine marks @rhdev as having lost power.  When
+ * the hub driver is resumed it will take notice; if CONFIG_USB_PERSIST
+ * is enabled then it will carry out power-session recovery, otherwise
+ * it will disconnect all the child devices.
  */
 void usb_root_hub_lost_power(struct usb_device *rhdev)
 {
-	struct usb_hub *hub;
-	int port1;
-	unsigned long flags;
-
 	dev_warn(&rhdev->dev, "root hub lost power or was reset\n");
-
-	spin_lock_irqsave(&device_state_lock, flags);
-	hub = hdev_to_hub(rhdev);
-	for (port1 = 1; port1 <= rhdev->maxchild; ++port1) {
-		if (rhdev->children[port1 - 1]) {
-			recursively_mark_NOTATTACHED(
-					rhdev->children[port1 - 1]);
-			set_bit(port1, hub->change_bits);
-		}
-	}
-	spin_unlock_irqrestore(&device_state_lock, flags);
+	rhdev->reset_resume = 1;
 }
 EXPORT_SYMBOL_GPL(usb_root_hub_lost_power);
 
@@ -1513,29 +1620,6 @@ static int hub_port_reset(struct usb_hub *hub, int port1,
 	return status;
 }
 
-/*
- * Disable a port and mark a logical connnect-change event, so that some
- * time later khubd will disconnect() any existing usb_device on the port
- * and will re-enumerate if there actually is a device attached.
- */
-static void hub_port_logical_disconnect(struct usb_hub *hub, int port1)
-{
-	dev_dbg(hub->intfdev, "logical disconnect on port %d\n", port1);
-	hub_port_disable(hub, port1, 1);
-
-	/* FIXME let caller ask to power down the port:
-	 *  - some devices won't enumerate without a VBUS power cycle
-	 *  - SRP saves power that way
-	 *  - ... new call, TBD ...
-	 * That's easy if this hub can switch power per-port, and
-	 * khubd reactivates the port later (timer, SRP, etc).
-	 * Powerdown must be optional, because of reset/DFU.
-	 */
-
-	set_bit(port1, hub->change_bits);
- 	kick_khubd(hub);
-}
-
 #ifdef	CONFIG_PM
 
 #ifdef	CONFIG_USB_SUSPEND
@@ -3018,7 +3102,7 @@ int usb_reset_composite_device(struct usb_device *udev,
 					cintf->dev.driver) {
 				drv = to_usb_driver(cintf->dev.driver);
 				if (drv->post_reset)
-					(drv->post_reset)(cintf);
+					(drv->post_reset)(cintf, 0);
 			}
 			if (cintf != iface)
 				up(&cintf->dev.sem);
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index 6f361df374fc..1a4862886733 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -36,6 +36,7 @@ extern void usb_host_cleanup(void);
 extern void usb_autosuspend_work(struct work_struct *work);
 extern int usb_port_suspend(struct usb_device *dev);
 extern int usb_port_resume(struct usb_device *dev);
+extern int usb_reset_suspended_device(struct usb_device *udev);
 extern int usb_external_suspend_device(struct usb_device *udev,
 		pm_message_t msg);
 extern int usb_external_resume_device(struct usb_device *udev);
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index df5dc186aef5..be4cd8fe4ce6 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -236,7 +236,7 @@ static void storage_pre_reset(struct usb_interface *iface)
 	mutex_lock(&us->dev_mutex);
 }
 
-static void storage_post_reset(struct usb_interface *iface)
+static void storage_post_reset(struct usb_interface *iface, int reset_resume)
 {
 	struct us_data *us = usb_get_intfdata(iface);
 
@@ -249,7 +249,11 @@ static void storage_post_reset(struct usb_interface *iface)
 
 	/* FIXME: Notify the subdrivers that they need to reinitialize
 	 * the device */
-	mutex_unlock(&us->dev_mutex);
+
+	/* If this is a reset-resume then the pre_reset routine wasn't
+	 * called, so we don't need to unlock the mutex. */
+	if (!reset_resume)
+		mutex_unlock(&us->dev_mutex);
 }
 
 /*
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 56aa2ee21f1b..3d63e0c2dd70 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -403,6 +403,7 @@ struct usb_device {
 
 	unsigned auto_pm:1;		/* autosuspend/resume in progress */
 	unsigned do_remote_wakeup:1;	/* remote wakeup should be enabled */
+	unsigned reset_resume:1;	/* needs reset instead of resume */
 	unsigned autosuspend_disabled:1; /* autosuspend and autoresume */
 	unsigned autoresume_disabled:1;  /*  disabled by the user */
 #endif
@@ -819,7 +820,10 @@ struct usbdrv_wrap {
  * @pre_reset: Called by usb_reset_composite_device() when the device
  *	is about to be reset.
  * @post_reset: Called by usb_reset_composite_device() after the device
- *	has been reset.
+ *	has been reset, or in lieu of @resume following a reset-resume
+ *	(i.e., the device is reset instead of being resumed, as might
+ *	happen if power was lost).  The second argument tells which is
+ *	the reason.
  * @id_table: USB drivers use ID table to support hotplugging.
  *	Export this with MODULE_DEVICE_TABLE(usb,...).  This must be set
  *	or your driver's probe function will never get called.
@@ -861,7 +865,7 @@ struct usb_driver {
 	int (*resume) (struct usb_interface *intf);
 
 	void (*pre_reset) (struct usb_interface *intf);
-	void (*post_reset) (struct usb_interface *intf);
+	void (*post_reset) (struct usb_interface *intf, int reset_resume);
 
 	const struct usb_device_id *id_table;
 
-- 
cgit v1.3-8-gc7d7


From 6bc6cff52e0c4c4c876b1b8a5750041da61ad42b Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 4 May 2007 11:53:03 -0400
Subject: USB: add RESET_RESUME device quirk

This patch (as888) adds a new USB device quirk for devices which are
unable to resume correctly.  By using the new code added for the
USB-persist facility, it is a simple matter to reset these devices
instead of resuming them.  To get things kicked off, a quirk entry is
added for the Philips PSC805.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/driver.c  | 4 ++++
 drivers/usb/core/hub.c     | 5 +++++
 drivers/usb/core/quirks.c  | 3 +++
 include/linux/usb/quirks.h | 3 +++
 4 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 12dd986bdffd..02d6db61c940 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -24,6 +24,7 @@
 
 #include <linux/device.h>
 #include <linux/usb.h>
+#include <linux/usb/quirks.h>
 #include <linux/workqueue.h>
 #include "hcd.h"
 #include "usb.h"
@@ -835,6 +836,9 @@ static int usb_resume_device(struct usb_device *udev)
 		goto done;
 	}
 
+	if (udev->quirks & USB_QUIRK_RESET_RESUME)
+		udev->reset_resume = 1;
+
 	udriver = to_usb_device_driver(udev->dev.driver);
 	status = udriver->resume(udev);
 
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 51d2d304568b..d37ad083d5ef 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2939,6 +2939,11 @@ static int config_descriptors_changed(struct usb_device *udev)
  * this from a driver probe() routine after downloading new firmware.
  * For calls that might not occur during probe(), drivers should lock
  * the device using usb_lock_device_for_reset().
+ *
+ * Locking exception: This routine may also be called from within an
+ * autoresume handler.  Such usage won't conflict with other tasks
+ * holding the device lock because these tasks should always call
+ * usb_autopm_resume_device(), thereby preventing any unwanted autoresume.
  */
 int usb_reset_device(struct usb_device *udev)
 {
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 739f520908aa..f37fa012f329 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -35,6 +35,9 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Elsa MicroLink 56k (V.250) */
 	{ USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_NO_AUTOSUSPEND },
 
+	/* Philips PSC805 audio device */
+	{ USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME },
+
 	{ }  /* terminating entry must be last */
 };
 
diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index 6bac8faacbc6..8da374caf582 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -9,3 +9,6 @@
 
 /* string descriptors must not be fetched using a 255-byte read */
 #define USB_QUIRK_STRING_FETCH_255	0x00000002
+
+/* device can't resume correctly so reset it instead */
+#define USB_QUIRK_RESET_RESUME		0x00000004
-- 
cgit v1.3-8-gc7d7


From 8538f96ae5aada1c04d69a993b20ad160b191d47 Mon Sep 17 00:00:00 2001
From: Daniel Drake <dsd@gentoo.org>
Date: Thu, 10 May 2007 00:32:24 +0100
Subject: USB: add USB_DEVICE_AND_INTERFACE_INFO for device matching

Recently, the USB device matching code stopped matching generic interface
matches against devices with vendor-specific device class values.

Some drivers now need to explicitly match USB device ID's (in addition to
generic interface info) to retain the same behaviour as before. This new macro,
suggested by Alan Stern, makes the explicit device/interface matching a little
simpler for those users.

Signed-off-by: Daniel Drake <dsd@gentoo.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 3d63e0c2dd70..98e0338664fb 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -771,6 +771,28 @@ static inline int usb_endpoint_is_isoc_out(const struct usb_endpoint_descriptor
 	.match_flags = USB_DEVICE_ID_MATCH_INT_INFO, .bInterfaceClass = (cl), \
 	.bInterfaceSubClass = (sc), .bInterfaceProtocol = (pr)
 
+/**
+ * USB_DEVICE_AND_INTERFACE_INFO - macro used to describe a specific usb device
+ * 		with a class of usb interfaces
+ * @vend: the 16 bit USB Vendor ID
+ * @prod: the 16 bit USB Product ID
+ * @cl: bInterfaceClass value
+ * @sc: bInterfaceSubClass value
+ * @pr: bInterfaceProtocol value
+ *
+ * This macro is used to create a struct usb_device_id that matches a
+ * specific device with a specific class of interfaces.
+ *
+ * This is especially useful when explicitly matching devices that have
+ * vendor specific bDeviceClass values, but standards-compliant interfaces.
+ */
+#define USB_DEVICE_AND_INTERFACE_INFO(vend,prod,cl,sc,pr) \
+	.match_flags = USB_DEVICE_ID_MATCH_INT_INFO \
+		| USB_DEVICE_ID_MATCH_DEVICE, \
+	.idVendor = (vend), .idProduct = (prod), \
+	.bInterfaceClass = (cl), \
+	.bInterfaceSubClass = (sc), .bInterfaceProtocol = (pr)
+
 /* ----------------------------------------------------------------------- */
 
 /* Stuff for dynamic usb ids */
-- 
cgit v1.3-8-gc7d7


From a5262dcfda9163ca1f8a64349a6f7ba640ac1dc2 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Mon, 14 May 2007 19:36:41 -0700
Subject: USB: export <linux/usb_gadgetfs> as <linux/usb/gadgetfs.h>

Make sure gadgetfs userspace interface is properly exported:

 - Move <linux/usb_gadgetfs.h> to <linux/usb/gadgetfs.h>;
 - Export it using Kbuild;
 - Add an #include guard;
 - Correct some internal documentation;
 - Update struct layout so it's the same on 32/64 bit kernels.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/inode.c   |  2 +-
 include/linux/usb/Kbuild     |  1 +
 include/linux/usb/gadgetfs.h | 81 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb_gadgetfs.h | 75 ----------------------------------------
 4 files changed, 83 insertions(+), 76 deletions(-)
 create mode 100644 include/linux/usb/gadgetfs.h
 delete mode 100644 include/linux/usb_gadgetfs.h

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 23b439ab774c..f723e083c9d0 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -37,7 +37,7 @@
 #include <linux/device.h>
 #include <linux/moduleparam.h>
 
-#include <linux/usb_gadgetfs.h>
+#include <linux/usb/gadgetfs.h>
 #include <linux/usb_gadget.h>
 
 
diff --git a/include/linux/usb/Kbuild b/include/linux/usb/Kbuild
index 43f160cfe003..6ce42bf9f743 100644
--- a/include/linux/usb/Kbuild
+++ b/include/linux/usb/Kbuild
@@ -1,5 +1,6 @@
 unifdef-y += audio.h
 unifdef-y += cdc.h
 unifdef-y += ch9.h
+unifdef-y += gadgetfs.h
 unifdef-y += midi.h
 
diff --git a/include/linux/usb/gadgetfs.h b/include/linux/usb/gadgetfs.h
new file mode 100644
index 000000000000..e8654c338729
--- /dev/null
+++ b/include/linux/usb/gadgetfs.h
@@ -0,0 +1,81 @@
+#ifndef __LINUX_USB_GADGETFS_H
+#define __LINUX_USB_GADGETFS_H
+
+#include <asm/types.h>
+#include <asm/ioctl.h>
+
+#include <linux/usb/ch9.h>
+
+/*
+ * Filesystem based user-mode API to USB Gadget controller hardware
+ *
+ * Other than ep0 operations, most things are done by read() and write()
+ * on endpoint files found in one directory.  They are configured by
+ * writing descriptors, and then may be used for normal stream style
+ * i/o requests.  When ep0 is configured, the device can enumerate;
+ * when it's closed, the device disconnects from usb.  Operations on
+ * ep0 require ioctl() operations.
+ *
+ * Configuration and device descriptors get written to /dev/gadget/$CHIP,
+ * which may then be used to read usb_gadgetfs_event structs.  The driver
+ * may activate endpoints as it handles SET_CONFIGURATION setup events,
+ * or earlier; writing endpoint descriptors to /dev/gadget/$ENDPOINT
+ * then performing data transfers by reading or writing.
+ */
+
+/*
+ * Events are delivered on the ep0 file descriptor, when the user mode driver
+ * reads from this file descriptor after writing the descriptors.  Don't
+ * stop polling this descriptor.
+ */
+
+enum usb_gadgetfs_event_type {
+	GADGETFS_NOP = 0,
+
+	GADGETFS_CONNECT,
+	GADGETFS_DISCONNECT,
+	GADGETFS_SETUP,
+	GADGETFS_SUSPEND,
+	// and likely more !
+};
+
+/* NOTE:  this structure must stay the same size and layout on
+ * both 32-bit and 64-bit kernels.
+ */
+struct usb_gadgetfs_event {
+	union {
+		// NOP, DISCONNECT, SUSPEND: nothing
+		// ... some hardware can't report disconnection
+
+		// CONNECT: just the speed
+		enum usb_device_speed	speed;
+
+		// SETUP: packet; DATA phase i/o precedes next event
+		// (setup.bmRequestType & USB_DIR_IN) flags direction
+		// ... includes SET_CONFIGURATION, SET_INTERFACE
+		struct usb_ctrlrequest	setup;
+	} u;
+	enum usb_gadgetfs_event_type	type;
+};
+
+
+/* endpoint ioctls */
+
+/* IN transfers may be reported to the gadget driver as complete
+ *	when the fifo is loaded, before the host reads the data;
+ * OUT transfers may be reported to the host's "client" driver as
+ *	complete when they're sitting in the FIFO unread.
+ * THIS returns how many bytes are "unclaimed" in the endpoint fifo
+ * (needed for precise fault handling, when the hardware allows it)
+ */
+#define	GADGETFS_FIFO_STATUS	_IO('g',1)
+
+/* discards any unclaimed data in the fifo. */
+#define	GADGETFS_FIFO_FLUSH	_IO('g',2)
+
+/* resets endpoint halt+toggle; used to implement set_interface.
+ * some hardware (like pxa2xx) can't support this.
+ */
+#define	GADGETFS_CLEAR_HALT	_IO('g',3)
+
+#endif /* __LINUX_USB_GADGETFS_H */
diff --git a/include/linux/usb_gadgetfs.h b/include/linux/usb_gadgetfs.h
deleted file mode 100644
index 8086d5a9b94e..000000000000
--- a/include/linux/usb_gadgetfs.h
+++ /dev/null
@@ -1,75 +0,0 @@
-
-#include <asm/types.h>
-#include <asm/ioctl.h>
-
-#include <linux/usb/ch9.h>
-
-/*
- * Filesystem based user-mode API to USB Gadget controller hardware
- *
- * Almost everything can be done with only read and write operations,
- * on endpoint files found in one directory.  They are configured by
- * writing descriptors, and then may be used for normal stream style
- * i/o requests.  When ep0 is configured, the device can enumerate;
- * when it's closed, the device disconnects from usb.
- *
- * Configuration and device descriptors get written to /dev/gadget/$CHIP,
- * which may then be used to read usb_gadgetfs_event structs.  The driver
- * may activate endpoints as it handles SET_CONFIGURATION setup events,
- * or earlier; writing endpoint descriptors to /dev/gadget/$ENDPOINT
- * then performing data transfers by reading or writing.
- */
-
-/*
- * Events are delivered on the ep0 file descriptor, if the user mode driver
- * reads from this file descriptor after writing the descriptors.  Don't
- * stop polling this descriptor, if you write that kind of driver.
- */
-
-enum usb_gadgetfs_event_type {
-	GADGETFS_NOP = 0,
-
-	GADGETFS_CONNECT,
-	GADGETFS_DISCONNECT,
-	GADGETFS_SETUP,
-	GADGETFS_SUSPEND,
-	// and likely more !
-};
-
-struct usb_gadgetfs_event {
-	enum usb_gadgetfs_event_type	type;
-	union {
-		// NOP, DISCONNECT, SUSPEND: nothing
-		// ... some hardware can't report disconnection
-
-		// CONNECT: just the speed
-		enum usb_device_speed	speed;
-
-		// SETUP: packet; DATA phase i/o precedes next event
-		// (setup.bmRequestType & USB_DIR_IN) flags direction 
-		// ... includes SET_CONFIGURATION, SET_INTERFACE
-		struct usb_ctrlrequest	setup;
-	} u;
-};
-
-
-/* endpoint ioctls */
-
-/* IN transfers may be reported to the gadget driver as complete
- * 	when the fifo is loaded, before the host reads the data;
- * OUT transfers may be reported to the host's "client" driver as
- * 	complete when they're sitting in the FIFO unread.
- * THIS returns how many bytes are "unclaimed" in the endpoint fifo
- * (needed for precise fault handling, when the hardware allows it)
- */
-#define	GADGETFS_FIFO_STATUS	_IO('g',1)
-
-/* discards any unclaimed data in the fifo. */
-#define	GADGETFS_FIFO_FLUSH	_IO('g',2)
-
-/* resets endpoint halt+toggle; used to implement set_interface.
- * some hardware (like pxa2xx) can't support this.
- */
-#define	GADGETFS_CLEAR_HALT	_IO('g',3)
-
-
-- 
cgit v1.3-8-gc7d7


From 51a2f077c44e559841b09de6da605b4d3ae40dad Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.de>
Date: Fri, 25 May 2007 13:40:56 +0200
Subject: USB: introduce usb_anchor

- introduction of usb_anchor and its methods

Signed-off-by: Oliver Neukum <oneukum@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/hcd.c |   2 +
 drivers/usb/core/urb.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/usb.h    |  22 +++++++++++
 3 files changed, 123 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index c5a2f83991dc..87d6edf11f92 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1410,6 +1410,8 @@ void usb_hcd_giveback_urb (struct usb_hcd *hcd, struct urb *urb)
 	}
 
 	usbmon_urb_complete (&hcd->self, urb);
+	usb_unanchor_urb(urb);
+
 	/* pass ownership to the completion handler */
 	urb->complete (urb);
 	atomic_dec (&urb->use_count);
diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index 94ea9727ff55..ac4273dddf34 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -4,6 +4,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/wait.h>
 #include "hcd.h"
 
 #define to_urb(d) container_of(d, struct urb, kref)
@@ -11,6 +12,7 @@
 static void urb_destroy(struct kref *kref)
 {
 	struct urb *urb = to_urb(kref);
+
 	kfree(urb);
 }
 
@@ -34,6 +36,7 @@ void usb_init_urb(struct urb *urb)
 		memset(urb, 0, sizeof(*urb));
 		kref_init(&urb->kref);
 		spin_lock_init(&urb->lock);
+		INIT_LIST_HEAD(&urb->anchor_list);
 	}
 }
 
@@ -100,8 +103,60 @@ struct urb * usb_get_urb(struct urb *urb)
 		kref_get(&urb->kref);
 	return urb;
 }
-		
-		
+
+/**
+ * usb_anchor_urb - anchors an URB while it is processed
+ * @urb: pointer to the urb to anchor
+ * @anchor: pointer to the anchor
+ *
+ * This can be called to have access to URBs which are to be executed
+ * without bothering to track them
+ */
+void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&anchor->lock, flags);
+	usb_get_urb(urb);
+	list_add_tail(&urb->anchor_list, &anchor->urb_list);
+	urb->anchor = anchor;
+	spin_unlock_irqrestore(&anchor->lock, flags);
+}
+EXPORT_SYMBOL_GPL(usb_anchor_urb);
+
+/**
+ * usb_unanchor_urb - unanchors an URB
+ * @urb: pointer to the urb to anchor
+ *
+ * Call this to stop the system keeping track of this URB
+ */
+void usb_unanchor_urb(struct urb *urb)
+{
+	unsigned long flags;
+	struct usb_anchor *anchor;
+
+	if (!urb)
+		return;
+
+	anchor = urb->anchor;
+	if (!anchor)
+		return;
+
+	spin_lock_irqsave(&anchor->lock, flags);
+	if (unlikely(anchor != urb->anchor)) {
+		/* we've lost the race to another thread */
+		spin_unlock_irqrestore(&anchor->lock, flags);
+		return;
+	}
+	urb->anchor = NULL;
+	list_del(&urb->anchor_list);
+	spin_unlock_irqrestore(&anchor->lock, flags);
+	usb_put_urb(urb);
+	if (list_empty(&anchor->urb_list))
+		wake_up(&anchor->wait);
+}
+EXPORT_SYMBOL_GPL(usb_unanchor_urb);
+
 /*-------------------------------------------------------------------*/
 
 /**
@@ -478,6 +533,48 @@ void usb_kill_urb(struct urb *urb)
 	spin_unlock_irq(&urb->lock);
 }
 
+/**
+ * usb_kill_anchored_urbs - cancel transfer requests en masse
+ * @anchor: anchor the requests are bound to
+ *
+ * this allows all outstanding URBs to be killed starting
+ * from the back of the queue
+ */
+void usb_kill_anchored_urbs(struct usb_anchor *anchor)
+{
+	struct urb *victim;
+
+	spin_lock_irq(&anchor->lock);
+	while (!list_empty(&anchor->urb_list)) {
+		victim = list_entry(anchor->urb_list.prev, struct urb, anchor_list);
+		/* we must make sure the URB isn't freed before we kill it*/
+		usb_get_urb(victim);
+		spin_unlock_irq(&anchor->lock);
+		/* this will unanchor the URB */
+		usb_kill_urb(victim);
+		usb_put_urb(victim);
+		spin_lock_irq(&anchor->lock);
+	}
+	spin_unlock_irq(&anchor->lock);
+}
+EXPORT_SYMBOL_GPL(usb_kill_anchored_urbs);
+
+/**
+ * usb_wait_anchor_empty_timeout - wait for an anchor to be unused
+ * @anchor: the anchor you want to become unused
+ * @timeout: how long you are willing to wait in milliseconds
+ *
+ * Call this is you want to be sure all an anchor's
+ * URBs have finished
+ */
+int usb_wait_anchor_empty_timeout(struct usb_anchor *anchor,
+				  unsigned int timeout)
+{
+	return wait_event_timeout(anchor->wait, list_empty(&anchor->urb_list),
+				  msecs_to_jiffies(timeout));
+}
+EXPORT_SYMBOL_GPL(usb_wait_anchor_empty_timeout);
+
 EXPORT_SYMBOL(usb_init_urb);
 EXPORT_SYMBOL(usb_alloc_urb);
 EXPORT_SYMBOL(usb_free_urb);
@@ -485,4 +582,3 @@ EXPORT_SYMBOL(usb_get_urb);
 EXPORT_SYMBOL(usb_submit_urb);
 EXPORT_SYMBOL(usb_unlink_urb);
 EXPORT_SYMBOL(usb_kill_urb);
-
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 98e0338664fb..0873c6219efc 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1000,11 +1000,26 @@ struct usb_iso_packet_descriptor {
 
 struct urb;
 
+struct usb_anchor {
+	struct list_head urb_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+static inline void init_usb_anchor(struct usb_anchor *anchor)
+{
+	INIT_LIST_HEAD(&anchor->urb_list);
+	init_waitqueue_head(&anchor->wait);
+	spin_lock_init(&anchor->lock);
+}
+
 typedef void (*usb_complete_t)(struct urb *);
 
 /**
  * struct urb - USB Request Block
  * @urb_list: For use by current owner of the URB.
+ * @anchor_list: membership in the list of an anchor
+ * @anchor: to anchor URBs to a common mooring
  * @pipe: Holds endpoint number, direction, type, and more.
  *	Create these values with the eight macros available;
  *	usb_{snd,rcv}TYPEpipe(dev,endpoint), where the TYPE is "ctrl"
@@ -1177,6 +1192,8 @@ struct urb
 	/* public: documented fields in the urb that can be used by drivers */
 	struct list_head urb_list;	/* list head for use by the urb's
 					 * current owner */
+	struct list_head anchor_list;	/* the URB may be anchored by the driver */
+	struct usb_anchor *anchor;
 	struct usb_device *dev; 	/* (in) pointer to associated device */
 	unsigned int pipe;		/* (in) pipe information */
 	int status;			/* (return) non-ISO status */
@@ -1312,6 +1329,11 @@ extern struct urb *usb_get_urb(struct urb *urb);
 extern int usb_submit_urb(struct urb *urb, gfp_t mem_flags);
 extern int usb_unlink_urb(struct urb *urb);
 extern void usb_kill_urb(struct urb *urb);
+extern void usb_kill_anchored_urbs(struct usb_anchor *anchor);
+extern void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor);
+extern void usb_unanchor_urb(struct urb *urb);
+extern int usb_wait_anchor_empty_timeout(struct usb_anchor *anchor,
+					 unsigned int timeout);
 
 void *usb_buffer_alloc (struct usb_device *dev, size_t size,
 	gfp_t mem_flags, dma_addr_t *dma);
-- 
cgit v1.3-8-gc7d7


From f07600cf9eb3ee92777b2001e564faa413144a99 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 30 May 2007 15:38:16 -0400
Subject: USB: add reset_resume method

This patch (as918) introduces a new USB driver method: reset_resume.
It is called when a device needs to be reset as part of a resume
procedure (whether because of a device quirk or because of the
USB-Persist facility), thereby taking over a role formerly assigned to
the post_reset method.  As a consequence, post_reset no longer needs
an argument indicating whether it is being called as part of a
reset-resume.  This separation of functions makes the code clearer.

In addition, the pre_reset and post_reset method return types are
changed; they now must return an error code.  The return value is
unused at present, but at some later time we may unbind drivers and
re-probe if they encounter an error during reset handling.

The existing pre_reset and post_reset methods in the usbhid,
usb-storage, and hub drivers are updated to match the new
requirements.  For usbhid the post_reset routine is also used for
reset_resume (duplicate method pointers); for the other drivers a new
reset_resume routine is added.  The change to hub.c looks bigger than
it really is, because mark_children_for_reset_resume() gets moved down
next to the new hub_reset_resume() routine.

A minor change to usb-storage makes the usb_stor_report_bus_reset()
routine acquire the host lock instead of requiring the caller to hold
it already.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
CC: Matthew Dharm <mdharm-usb@one-eyed-alien.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/hid/usbhid/hid-core.c  |   9 ++--
 drivers/usb/core/driver.c      |  51 ++++++++++++++----
 drivers/usb/core/hub.c         | 117 ++++++++++++++++++++++-------------------
 drivers/usb/storage/scsiglue.c |   8 ++-
 drivers/usb/storage/usb.c      |  28 +++++++---
 include/linux/usb.h            |   7 ++-
 6 files changed, 140 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index e221b0d1f667..b2baeaeba9be 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -1009,20 +1009,22 @@ static int hid_resume(struct usb_interface *intf)
 }
 
 /* Treat USB reset pretty much the same as suspend/resume */
-static void hid_pre_reset(struct usb_interface *intf)
+static int hid_pre_reset(struct usb_interface *intf)
 {
 	/* FIXME: What if the interface is already suspended? */
 	hid_suspend(intf, PMSG_ON);
+	return 0;
 }
 
-static void hid_post_reset(struct usb_interface *intf, int reset_resume)
+/* Same routine used for post_reset and reset_resume */
+static int hid_post_reset(struct usb_interface *intf)
 {
 	struct usb_device *dev = interface_to_usbdev (intf);
 
 	hid_set_idle(dev, intf->cur_altsetting->desc.bInterfaceNumber, 0, 0);
 	/* FIXME: Any more reinitialization needed? */
 
-	hid_resume(intf);
+	return hid_resume(intf);
 }
 
 static struct usb_device_id hid_usb_ids [] = {
@@ -1039,6 +1041,7 @@ static struct usb_driver hid_driver = {
 	.disconnect =	hid_disconnect,
 	.suspend =	hid_suspend,
 	.resume =	hid_resume,
+	.reset_resume =	hid_post_reset,
 	.pre_reset =	hid_pre_reset,
 	.post_reset =	hid_post_reset,
 	.id_table =	hid_usb_ids,
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 6c62a6d91484..3cd9af2638fc 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -915,21 +915,37 @@ static int usb_resume_interface(struct usb_interface *intf, int reset_resume)
 	}
 	driver = to_usb_driver(intf->dev.driver);
 
-	if (reset_resume && driver->post_reset)
-		driver->post_reset(intf, reset_resume);
-	else if (driver->resume) {
-		status = driver->resume(intf);
-		if (status)
-			dev_err(&intf->dev, "%s error %d\n",
-					"resume", status);
-	} else
-		dev_warn(&intf->dev, "no resume for driver %s?\n",
-				driver->name);
+	if (reset_resume) {
+		if (driver->reset_resume) {
+			status = driver->reset_resume(intf);
+			if (status)
+				dev_err(&intf->dev, "%s error %d\n",
+						"reset_resume", status);
+		} else {
+			// status = -EOPNOTSUPP;
+			dev_warn(&intf->dev, "no %s for driver %s?\n",
+					"reset_resume", driver->name);
+		}
+	} else {
+		if (driver->resume) {
+			status = driver->resume(intf);
+			if (status)
+				dev_err(&intf->dev, "%s error %d\n",
+						"resume", status);
+		} else {
+			// status = -EOPNOTSUPP;
+			dev_warn(&intf->dev, "no %s for driver %s?\n",
+					"resume", driver->name);
+		}
+	}
 
 done:
 	dev_vdbg(&intf->dev, "%s: status %d\n", __FUNCTION__, status);
 	if (status == 0)
 		mark_active(intf);
+
+	/* FIXME: Unbind the driver and reprobe if the resume failed
+	 * (not possible if auto_pm is set) */
 	return status;
 }
 
@@ -966,6 +982,18 @@ static int autosuspend_check(struct usb_device *udev)
 						"for autosuspend\n");
 				return -EOPNOTSUPP;
 			}
+
+			/* Don't allow autosuspend if the device will need
+			 * a reset-resume and any of its interface drivers
+			 * doesn't include support.
+			 */
+			if (udev->quirks & USB_QUIRK_RESET_RESUME) {
+				struct usb_driver *driver;
+
+				driver = to_usb_driver(intf->dev.driver);
+				if (!driver->reset_resume)
+					return -EOPNOTSUPP;
+			}
 		}
 	}
 
@@ -1146,7 +1174,8 @@ static int usb_resume_both(struct usb_device *udev)
 			status = usb_autoresume_device(parent);
 			if (status == 0) {
 				status = usb_resume_device(udev);
-				if (status) {
+				if (status || udev->state ==
+						USB_STATE_NOTATTACHED) {
 					usb_autosuspend_device(parent);
 
 					/* It's possible usb_resume_device()
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index ca3dbf84e800..0b8ed414d5cf 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -605,73 +605,26 @@ static void disconnect_all_children(struct usb_hub *hub, int logical)
 	}
 }
 
-#ifdef	CONFIG_USB_PERSIST
-
-#define USB_PERSIST	1
-
-/* For "persistent-device" resets we must mark the child devices for reset
- * and turn off a possible connect-change status (so khubd won't disconnect
- * them later).
- */
-static void mark_children_for_reset_resume(struct usb_hub *hub)
-{
-	struct usb_device *hdev = hub->hdev;
-	int port1;
-
-	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
-		struct usb_device *child = hdev->children[port1-1];
-
-		if (child) {
-			child->reset_resume = 1;
-			clear_port_feature(hdev, port1,
-					USB_PORT_FEAT_C_CONNECTION);
-		}
-	}
-}
-
-#else
-
-#define USB_PERSIST	0
-
-static inline void mark_children_for_reset_resume(struct usb_hub *hub)
-{ }
-
-#endif	/* CONFIG_USB_PERSIST */
-
 /* caller has locked the hub device */
-static void hub_pre_reset(struct usb_interface *intf)
+static int hub_pre_reset(struct usb_interface *intf)
 {
 	struct usb_hub *hub = usb_get_intfdata(intf);
 
-	/* This routine doesn't run as part of a reset-resume, so it's safe
-	 * to disconnect all the drivers below the hub.
-	 */
 	disconnect_all_children(hub, 0);
 	hub_quiesce(hub);
+	return 0;
 }
 
 /* caller has locked the hub device */
-static void hub_post_reset(struct usb_interface *intf, int reset_resume)
+static int hub_post_reset(struct usb_interface *intf)
 {
 	struct usb_hub *hub = usb_get_intfdata(intf);
 
 	hub_power_on(hub);
-	if (reset_resume) {
-		if (USB_PERSIST)
-			mark_children_for_reset_resume(hub);
-		else {
-			/* Reset-resume doesn't call pre_reset, so we have to
-			 * disconnect the children here.  But we may not lock
-			 * the child devices, so we have to do a "logical"
-			 * disconnect.
-			 */
-			disconnect_all_children(hub, 1);
-		}
-	}
 	hub_activate(hub);
+	return 0;
 }
 
-
 static int hub_configure(struct usb_hub *hub,
 	struct usb_endpoint_descriptor *endpoint)
 {
@@ -1931,6 +1884,58 @@ static int hub_resume(struct usb_interface *intf)
 	return 0;
 }
 
+#ifdef	CONFIG_USB_PERSIST
+
+#define USB_PERSIST	1
+
+/* For "persistent-device" resets we must mark the child devices for reset
+ * and turn off a possible connect-change status (so khubd won't disconnect
+ * them later).
+ */
+static void mark_children_for_reset_resume(struct usb_hub *hub)
+{
+	struct usb_device *hdev = hub->hdev;
+	int port1;
+
+	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
+		struct usb_device *child = hdev->children[port1-1];
+
+		if (child) {
+			child->reset_resume = 1;
+			clear_port_feature(hdev, port1,
+					USB_PORT_FEAT_C_CONNECTION);
+		}
+	}
+}
+
+#else
+
+#define USB_PERSIST	0
+
+static inline void mark_children_for_reset_resume(struct usb_hub *hub)
+{ }
+
+#endif	/* CONFIG_USB_PERSIST */
+
+static int hub_reset_resume(struct usb_interface *intf)
+{
+	struct usb_hub *hub = usb_get_intfdata(intf);
+
+	hub_power_on(hub);
+	if (USB_PERSIST)
+		mark_children_for_reset_resume(hub);
+	else {
+		/* Reset-resume doesn't call pre_reset, so we have to
+		 * disconnect the children here.  But we may not lock
+		 * the child devices, so we have to do a "logical"
+		 * disconnect.
+		 */
+		disconnect_all_children(hub, 1);
+	}
+	hub_activate(hub);
+	return 0;
+}
+
 #else	/* CONFIG_PM */
 
 static inline int remote_wakeup(struct usb_device *udev)
@@ -1938,8 +1943,9 @@ static inline int remote_wakeup(struct usb_device *udev)
 	return 0;
 }
 
-#define hub_suspend NULL
-#define hub_resume NULL
+#define hub_suspend		NULL
+#define hub_resume		NULL
+#define hub_reset_resume	NULL
 #endif
 
 
@@ -2768,6 +2774,7 @@ static struct usb_driver hub_driver = {
 	.disconnect =	hub_disconnect,
 	.suspend =	hub_suspend,
 	.resume =	hub_resume,
+	.reset_resume =	hub_reset_resume,
 	.pre_reset =	hub_pre_reset,
 	.post_reset =	hub_post_reset,
 	.ioctl =	hub_ioctl,
@@ -3021,6 +3028,7 @@ int usb_reset_composite_device(struct usb_device *udev,
 				drv = to_usb_driver(cintf->dev.driver);
 				if (drv->pre_reset)
 					(drv->pre_reset)(cintf);
+	/* FIXME: Unbind if pre_reset returns an error or isn't defined */
 			}
 		}
 	}
@@ -3038,7 +3046,8 @@ int usb_reset_composite_device(struct usb_device *udev,
 					cintf->dev.driver) {
 				drv = to_usb_driver(cintf->dev.driver);
 				if (drv->post_reset)
-					(drv->post_reset)(cintf, 0);
+					(drv->post_reset)(cintf);
+	/* FIXME: Unbind if post_reset returns an error or isn't defined */
 			}
 			if (cintf != iface)
 				up(&cintf->dev.sem);
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index e227f64d5641..1ba19eaa1970 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -321,10 +321,14 @@ void usb_stor_report_device_reset(struct us_data *us)
 
 /* Report a driver-initiated bus reset to the SCSI layer.
  * Calling this for a SCSI-initiated reset is unnecessary but harmless.
- * The caller must own the SCSI host lock. */
+ * The caller must not own the SCSI host lock. */
 void usb_stor_report_bus_reset(struct us_data *us)
 {
-	scsi_report_bus_reset(us_to_host(us), 0);
+	struct Scsi_Host *host = us_to_host(us);
+
+	scsi_lock(host);
+	scsi_report_bus_reset(host, 0);
+	scsi_unlock(host);
 }
 
 /***********************************************************************
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index be4cd8fe4ce6..00521f1d6a6b 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -219,6 +219,20 @@ static int storage_resume(struct usb_interface *iface)
 	return 0;
 }
 
+static int storage_reset_resume(struct usb_interface *iface)
+{
+	struct us_data *us = usb_get_intfdata(iface);
+
+	US_DEBUGP("%s\n", __FUNCTION__);
+
+	/* Report the reset to the SCSI core */
+	usb_stor_report_bus_reset(us);
+
+	/* FIXME: Notify the subdrivers that they need to reinitialize
+	 * the device */
+	return 0;
+}
+
 #endif /* CONFIG_PM */
 
 /*
@@ -226,7 +240,7 @@ static int storage_resume(struct usb_interface *iface)
  * a USB port reset, whether from this driver or a different one.
  */
 
-static void storage_pre_reset(struct usb_interface *iface)
+static int storage_pre_reset(struct usb_interface *iface)
 {
 	struct us_data *us = usb_get_intfdata(iface);
 
@@ -234,26 +248,23 @@ static void storage_pre_reset(struct usb_interface *iface)
 
 	/* Make sure no command runs during the reset */
 	mutex_lock(&us->dev_mutex);
+	return 0;
 }
 
-static void storage_post_reset(struct usb_interface *iface, int reset_resume)
+static int storage_post_reset(struct usb_interface *iface)
 {
 	struct us_data *us = usb_get_intfdata(iface);
 
 	US_DEBUGP("%s\n", __FUNCTION__);
 
 	/* Report the reset to the SCSI core */
-	scsi_lock(us_to_host(us));
 	usb_stor_report_bus_reset(us);
-	scsi_unlock(us_to_host(us));
 
 	/* FIXME: Notify the subdrivers that they need to reinitialize
 	 * the device */
 
-	/* If this is a reset-resume then the pre_reset routine wasn't
-	 * called, so we don't need to unlock the mutex. */
-	if (!reset_resume)
-		mutex_unlock(&us->dev_mutex);
+	mutex_unlock(&us->dev_mutex);
+	return 0;
 }
 
 /*
@@ -1061,6 +1072,7 @@ static struct usb_driver usb_storage_driver = {
 #ifdef CONFIG_PM
 	.suspend =	storage_suspend,
 	.resume =	storage_resume,
+	.reset_resume =	storage_reset_resume,
 #endif
 	.pre_reset =	storage_pre_reset,
 	.post_reset =	storage_post_reset,
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 0873c6219efc..bde8c65e2bfc 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -839,6 +839,8 @@ struct usbdrv_wrap {
  *	do (or don't) show up otherwise in the filesystem.
  * @suspend: Called when the device is going to be suspended by the system.
  * @resume: Called when the device is being resumed by the system.
+ * @reset_resume: Called when the suspended device has been reset instead
+ *	of being resumed.
  * @pre_reset: Called by usb_reset_composite_device() when the device
  *	is about to be reset.
  * @post_reset: Called by usb_reset_composite_device() after the device
@@ -885,9 +887,10 @@ struct usb_driver {
 
 	int (*suspend) (struct usb_interface *intf, pm_message_t message);
 	int (*resume) (struct usb_interface *intf);
+	int (*reset_resume)(struct usb_interface *intf);
 
-	void (*pre_reset) (struct usb_interface *intf);
-	void (*post_reset) (struct usb_interface *intf, int reset_resume);
+	int (*pre_reset)(struct usb_interface *intf);
+	int (*post_reset)(struct usb_interface *intf);
 
 	const struct usb_device_id *id_table;
 
-- 
cgit v1.3-8-gc7d7


From b41a60eca833d76593d4dac8a59f5c38714194ee Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 30 May 2007 15:39:33 -0400
Subject: USB: add power/persist device attribute

This patch (as920) adds an extra level of protection to the
USB-Persist facility.  Now it will apply by default only to hubs; for
all other devices the user must enable it explicitly by setting the
power/persist device attribute.

The disconnect_all_children() routine in hub.c has been removed and
its code placed inline.  This is the way it was originally as part of
hub_pre_reset(); the revised usage in hub_reset_resume() is
sufficiently different that the code can no longer be shared.
Likewise, mark_children_for_reset() is now inline as part of
hub_reset_resume().  The end result looks much cleaner than before.

The sysfs interface is updated to add the new attribute file, and
there are corresponding documentation updates.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/ABI/testing/sysfs-bus-usb | 13 ++++++
 Documentation/usb/persist.txt           | 38 ++++++++++------
 drivers/usb/core/Kconfig                | 13 +++---
 drivers/usb/core/hub.c                  | 78 ++++++++++++---------------------
 drivers/usb/core/sysfs.c                | 75 ++++++++++++++++++++++++++++++-
 include/linux/usb.h                     |  1 +
 6 files changed, 149 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index f9937add033d..9734577d1711 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -39,3 +39,16 @@ Description:
 		If you want to suspend a device immediately but leave it
 		free to wake up in response to I/O requests, you should
 		write "0" to power/autosuspend.
+
+What:		/sys/bus/usb/devices/.../power/persist
+Date:		May 2007
+KernelVersion:	2.6.23
+Contact:	Alan Stern <stern@rowland.harvard.edu>
+Description:
+		If CONFIG_USB_PERSIST is set, then each USB device directory
+		will contain a file named power/persist.  The file holds a
+		boolean value (0 or 1) indicating whether or not the
+		"USB-Persist" facility is enabled for the device.  Since the
+		facility is inherently dangerous, it is disabled by default
+		for all devices except hubs.  For more information, see
+		Documentation/usb/persist.txt.
diff --git a/Documentation/usb/persist.txt b/Documentation/usb/persist.txt
index 6dcd5f884795..df54d645cbb5 100644
--- a/Documentation/usb/persist.txt
+++ b/Documentation/usb/persist.txt
@@ -2,7 +2,7 @@
 
 		   Alan Stern <stern@rowland.harvard.edu>
 
-		 September 2, 2006 (Updated March 27, 2007)
+		 September 2, 2006 (Updated May 29, 2007)
 
 
 	What is the problem?
@@ -52,9 +52,9 @@ you can convince the BIOS supplier to fix the problem (lots of luck!).
 
 On many systems the USB host controllers will get reset after a
 suspend-to-RAM.  On almost all systems, no suspend current is
-available during suspend-to-disk (also known as swsusp).  You can
-check the kernel log after resuming to see if either of these has
-happened; look for lines saying "root hub lost power or was reset".
+available during hibernation (also known as swsusp or suspend-to-disk).
+You can check the kernel log after resuming to see if either of these
+has happened; look for lines saying "root hub lost power or was reset".
 
 In practice, people are forced to unmount any filesystems on a USB
 device before suspending.  If the root filesystem is on a USB device,
@@ -71,15 +71,16 @@ structures are allowed to persist across a power-session disruption.
 It works like this.  If the kernel sees that a USB host controller is
 not in the expected state during resume (i.e., if the controller was
 reset or otherwise had lost power) then it applies a persistence check
-to each of the USB devices below that controller.  It doesn't try to
-resume the device; that can't work once the power session is gone.
-Instead it issues a USB port reset and then re-enumerates the device.
-(This is exactly the same thing that happens whenever a USB device is
-reset.)  If the re-enumeration shows that the device now attached to
-that port has the same descriptors as before, including the Vendor and
-Product IDs, then the kernel continues to use the same device
-structure.  In effect, the kernel treats the device as though it had
-merely been reset instead of unplugged.
+to each of the USB devices below that controller for which the
+"persist" attribute is set.  It doesn't try to resume the device; that
+can't work once the power session is gone.  Instead it issues a USB
+port reset and then re-enumerates the device.  (This is exactly the
+same thing that happens whenever a USB device is reset.)  If the
+re-enumeration shows that the device now attached to that port has the
+same descriptors as before, including the Vendor and Product IDs, then
+the kernel continues to use the same device structure.  In effect, the
+kernel treats the device as though it had merely been reset instead of
+unplugged.
 
 If no device is now attached to the port, or if the descriptors are
 different from what the kernel remembers, then the treatment is what
@@ -91,6 +92,17 @@ The end result is that the USB device remains available and usable.
 Filesystem mounts and memory mappings are unaffected, and the world is
 now a good and happy place.
 
+Note that even when CONFIG_USB_PERSIST is set, the "persist" feature
+will be applied only to those devices for which it is enabled.  You
+can enable the feature by doing (as root):
+
+	echo 1 >/sys/bus/usb/devices/.../power/persist
+
+where the "..." should be filled in the with the device's ID.  Disable
+the feature by writing 0 instead of 1.  For hubs the feature is
+automatically and permanently enabled, so you only have to worry about
+setting it for devices where it really matters.
+
 
 	Is this the best solution?
 
diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig
index 5113ef4cb7f6..97b09f282705 100644
--- a/drivers/usb/core/Kconfig
+++ b/drivers/usb/core/Kconfig
@@ -91,12 +91,15 @@ config USB_PERSIST
 	depends on USB && PM && EXPERIMENTAL
 	default n
 	help
-	  If you say Y here, USB device data structures will remain
+
+	  If you say Y here and enable the "power/persist" attribute
+	  for a USB device, the device's data structures will remain
 	  persistent across system suspend, even if the USB bus loses
-	  power.  (This includes software-suspend, also known as swsusp,
-	  or suspend-to-disk.)  The devices will reappear as if by magic
-	  when the system wakes up, with no need to unmount USB filesystems,
-	  rmmod host-controller drivers, or do anything else.
+	  power.  (This includes hibernation, also known as swsusp or
+	  suspend-to-disk.)  The devices will reappear as if by magic
+	  when the system wakes up, with no need to unmount USB
+	  filesystems, rmmod host-controller drivers, or do anything
+	  else.
 
 	  	WARNING: This option can be dangerous!
 
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index c4cdb69a6e9e..50e79010401c 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -596,27 +596,18 @@ static void hub_port_logical_disconnect(struct usb_hub *hub, int port1)
  	kick_khubd(hub);
 }
 
-static void disconnect_all_children(struct usb_hub *hub, int logical)
-{
-	struct usb_device *hdev = hub->hdev;
-	int port1;
-
-	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
-		if (hdev->children[port1-1]) {
-			if (logical)
-				hub_port_logical_disconnect(hub, port1);
-			else
-				usb_disconnect(&hdev->children[port1-1]);
-		}
-	}
-}
-
 /* caller has locked the hub device */
 static int hub_pre_reset(struct usb_interface *intf)
 {
 	struct usb_hub *hub = usb_get_intfdata(intf);
+	struct usb_device *hdev = hub->hdev;
+	int i;
 
-	disconnect_all_children(hub, 0);
+	/* Disconnect all the children */
+	for (i = 0; i < hdev->maxchild; ++i) {
+		if (hdev->children[i])
+			usb_disconnect(&hdev->children[i]);
+	}
 	hub_quiesce(hub);
 	return 0;
 }
@@ -1872,50 +1863,39 @@ static int hub_resume(struct usb_interface *intf)
 	return 0;
 }
 
-#ifdef	CONFIG_USB_PERSIST
-
-/* For "persistent-device" resets we must mark the child devices for reset
- * and turn off a possible connect-change status (so khubd won't disconnect
- * them later).
- */
-static void mark_children_for_reset_resume(struct usb_hub *hub)
+static int hub_reset_resume(struct usb_interface *intf)
 {
+	struct usb_hub *hub = usb_get_intfdata(intf);
 	struct usb_device *hdev = hub->hdev;
 	int port1;
 
+	hub_power_on(hub);
+
 	for (port1 = 1; port1 <= hdev->maxchild; ++port1) {
 		struct usb_device *child = hdev->children[port1-1];
 
 		if (child) {
-			child->reset_resume = 1;
-			clear_port_feature(hdev, port1,
-					USB_PORT_FEAT_C_CONNECTION);
+
+			/* For "USB_PERSIST"-enabled children we must
+			 * mark the child device for reset-resume and
+			 * turn off the connect-change status to prevent
+			 * khubd from disconnecting it later.
+			 */
+			if (USB_PERSIST && child->persist_enabled) {
+				child->reset_resume = 1;
+				clear_port_feature(hdev, port1,
+						USB_PORT_FEAT_C_CONNECTION);
+
+			/* Otherwise we must disconnect the child,
+			 * but as we may not lock the child device here
+			 * we have to do a "logical" disconnect.
+			 */
+			} else {
+				hub_port_logical_disconnect(hub, port1);
+			}
 		}
 	}
-}
-
-#else
-
-static inline void mark_children_for_reset_resume(struct usb_hub *hub)
-{ }
-
-#endif	/* CONFIG_USB_PERSIST */
-
-static int hub_reset_resume(struct usb_interface *intf)
-{
-	struct usb_hub *hub = usb_get_intfdata(intf);
 
-	hub_power_on(hub);
-	if (USB_PERSIST)
-		mark_children_for_reset_resume(hub);
-	else {
-		/* Reset-resume doesn't call pre_reset, so we have to
-		 * disconnect the children here.  But we may not lock
-		 * the child devices, so we have to do a "logical"
-		 * disconnect.
-		 */
-		disconnect_all_children(hub, 1);
-	}
 	hub_activate(hub);
 	return 0;
 }
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index be37c863fdfb..5dfe31bc32ba 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -169,6 +169,73 @@ show_quirks(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR(quirks, S_IRUGO, show_quirks, NULL);
 
+
+#if defined(CONFIG_USB_PERSIST) || defined(CONFIG_USB_SUSPEND)
+static const char power_group[] = "power";
+#endif
+
+#ifdef	CONFIG_USB_PERSIST
+
+static ssize_t
+show_persist(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct usb_device *udev = to_usb_device(dev);
+
+	return sprintf(buf, "%d\n", udev->persist_enabled);
+}
+
+static ssize_t
+set_persist(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct usb_device *udev = to_usb_device(dev);
+	int value;
+
+	/* Hubs are always enabled for USB_PERSIST */
+	if (udev->descriptor.bDeviceClass == USB_CLASS_HUB)
+		return -EPERM;
+
+	if (sscanf(buf, "%d", &value) != 1)
+		return -EINVAL;
+	usb_pm_lock(udev);
+	udev->persist_enabled = !!value;
+	usb_pm_unlock(udev);
+	return count;
+}
+
+static DEVICE_ATTR(persist, S_IRUGO | S_IWUSR, show_persist, set_persist);
+
+static int add_persist_attributes(struct device *dev)
+{
+	int rc = 0;
+
+	if (is_usb_device(dev)) {
+		struct usb_device *udev = to_usb_device(dev);
+
+		/* Hubs are automatically enabled for USB_PERSIST */
+		if (udev->descriptor.bDeviceClass == USB_CLASS_HUB)
+			udev->persist_enabled = 1;
+		rc = sysfs_add_file_to_group(&dev->kobj,
+				&dev_attr_persist.attr,
+				power_group);
+	}
+	return rc;
+}
+
+static void remove_persist_attributes(struct device *dev)
+{
+	sysfs_remove_file_from_group(&dev->kobj,
+			&dev_attr_persist.attr,
+			power_group);
+}
+
+#else
+
+#define add_persist_attributes(dev)	0
+#define remove_persist_attributes(dev)	do {} while (0)
+
+#endif	/* CONFIG_USB_PERSIST */
+
 #ifdef	CONFIG_USB_SUSPEND
 
 static ssize_t
@@ -276,8 +343,6 @@ set_level(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR(level, S_IRUGO | S_IWUSR, show_level, set_level);
 
-static char power_group[] = "power";
-
 static int add_power_attributes(struct device *dev)
 {
 	int rc = 0;
@@ -311,6 +376,7 @@ static void remove_power_attributes(struct device *dev)
 
 #endif	/* CONFIG_USB_SUSPEND */
 
+
 /* Descriptor fields */
 #define usb_descriptor_attr_le16(field, format_string)			\
 static ssize_t								\
@@ -384,6 +450,10 @@ int usb_create_sysfs_dev_files(struct usb_device *udev)
 	if (retval)
 		return retval;
 
+	retval = add_persist_attributes(dev);
+	if (retval)
+		goto error;
+
 	retval = add_power_attributes(dev);
 	if (retval)
 		goto error;
@@ -421,6 +491,7 @@ void usb_remove_sysfs_dev_files(struct usb_device *udev)
 	device_remove_file(dev, &dev_attr_product);
 	device_remove_file(dev, &dev_attr_serial);
 	remove_power_attributes(dev);
+	remove_persist_attributes(dev);
 	sysfs_remove_group(&dev->kobj, &dev_attr_grp);
 }
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index bde8c65e2bfc..efce9a4c511c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -404,6 +404,7 @@ struct usb_device {
 	unsigned auto_pm:1;		/* autosuspend/resume in progress */
 	unsigned do_remote_wakeup:1;	/* remote wakeup should be enabled */
 	unsigned reset_resume:1;	/* needs reset instead of resume */
+	unsigned persist_enabled:1;	/* USB_PERSIST enabled for this dev */
 	unsigned autosuspend_disabled:1; /* autosuspend and autoresume */
 	unsigned autoresume_disabled:1;  /*  disabled by the user */
 #endif
-- 
cgit v1.3-8-gc7d7


From 8b3b01c898a44c2fc7217eb579982b9d132113f5 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Wed, 13 Jun 2007 08:02:11 +0200
Subject: USB: Add URB_FREE_BUFFER flag and the logic behind it

USB: Add URB_FREE_BUFFER flag for freeing the transfer buffer

In some cases it is not needed that the driver keeps track of the
transfer buffer of an URB. It can be simply freed along with the
URB itself when the reference count goes down to zero. The new
flag URB_FREE_BUFFER enables this behavior.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/urb.c | 3 +++
 include/linux/usb.h    | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index ac4273dddf34..52ec44b828f3 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -13,6 +13,9 @@ static void urb_destroy(struct kref *kref)
 {
 	struct urb *urb = to_urb(kref);
 
+	if (urb->transfer_flags & URB_FREE_BUFFER)
+		kfree(urb->transfer_buffer);
+
 	kfree(urb);
 }
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index efce9a4c511c..533c32374e01 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -994,6 +994,7 @@ extern int usb_disabled(void);
 #define URB_ZERO_PACKET		0x0040	/* Finish bulk OUT with short packet */
 #define URB_NO_INTERRUPT	0x0080	/* HINT: no non-error interrupt
 					 * needed */
+#define URB_FREE_BUFFER		0x0100	/* Free transfer buffer with the URB */
 
 struct usb_iso_packet_descriptor {
 	unsigned int offset;
-- 
cgit v1.3-8-gc7d7


From 165fe97ed6107d3cde63592d5ac36400a5eb9f6f Mon Sep 17 00:00:00 2001
From: "Craig W. Nadler" <craig@nadler.us>
Date: Fri, 15 Jun 2007 23:14:35 -0400
Subject: USB: add IAD support to usbfs and sysfs

USB_IAD: Adds support for USB Interface Association Descriptors.

This patch adds support to the USB host stack for parsing, storing, and
displaying Interface Association Descriptors. In /proc/bus/usb/devices
lines starting with A: show the fields in an IAD. In sysfs if an
interface on a USB device is referenced by an IAD the following files
will be added to the sysfs directory for that interface:
iad_bFirstInterface, iad_bInterfaceCount, iad_bFunctionClass, and
iad_bFunctionSubClass, iad_bFunctionProtocol

Signed-off-by: Craig W. Nadler <craig@nadler.us>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/config.c  | 15 +++++++++++++++
 drivers/usb/core/devices.c | 26 ++++++++++++++++++++++++++
 drivers/usb/core/message.c | 31 +++++++++++++++++++++++++++++++
 drivers/usb/core/sysfs.c   | 34 ++++++++++++++++++++++++++++++++++
 include/linux/usb.h        | 10 ++++++++++
 5 files changed, 116 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index 5e113db41b59..cb69aa1e02e8 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -295,6 +295,7 @@ static int usb_parse_configuration(struct device *ddev, int cfgidx,
 	struct usb_descriptor_header *header;
 	int len, retval;
 	u8 inums[USB_MAXINTERFACES], nalts[USB_MAXINTERFACES];
+	unsigned iad_num = 0;
 
 	memcpy(&config->desc, buffer, USB_DT_CONFIG_SIZE);
 	if (config->desc.bDescriptorType != USB_DT_CONFIG ||
@@ -372,6 +373,20 @@ static int usb_parse_configuration(struct device *ddev, int cfgidx,
 				++n;
 			}
 
+		} else if (header->bDescriptorType ==
+				USB_DT_INTERFACE_ASSOCIATION) {
+			if (iad_num == USB_MAXIADS) {
+				dev_warn(ddev, "found more Interface "
+					       "Association Descriptors "
+					       "than allocated for in "
+					       "configuration %d\n", cfgno);
+			} else {
+				config->intf_assoc[iad_num] =
+					(struct usb_interface_assoc_descriptor
+					*)header;
+				iad_num++;
+			}
+
 		} else if (header->bDescriptorType == USB_DT_DEVICE ||
 			    header->bDescriptorType == USB_DT_CONFIG)
 			dev_warn(ddev, "config %d contains an unexpected "
diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c
index 6753ca059ee4..87c794d60aa0 100644
--- a/drivers/usb/core/devices.c
+++ b/drivers/usb/core/devices.c
@@ -102,6 +102,10 @@ static const char *format_config =
 /* C:  #Ifs=dd Cfg#=dd Atr=xx MPwr=dddmA */
   "C:%c #Ifs=%2d Cfg#=%2d Atr=%02x MxPwr=%3dmA\n";
   
+static const char *format_iad =
+/* A:  FirstIf#=dd IfCount=dd Cls=xx(sssss) Sub=xx Prot=xx */
+  "A:  FirstIf#=%2d IfCount=%2d Cls=%02x(%-5s) Sub=%02x Prot=%02x\n";
+
 static const char *format_iface =
 /* I:  If#=dd Alt=dd #EPs=dd Cls=xx(sssss) Sub=xx Prot=xx Driver=xxxx*/
   "I:%c If#=%2d Alt=%2d #EPs=%2d Cls=%02x(%-5s) Sub=%02x Prot=%02x Driver=%s\n";
@@ -146,6 +150,7 @@ static const struct class_info clas_info[] =
 	{USB_CLASS_STILL_IMAGE,		"still"},
 	{USB_CLASS_CSCID,		"scard"},
 	{USB_CLASS_CONTENT_SEC,		"c-sec"},
+	{USB_CLASS_VIDEO,		"video"},
 	{-1,				"unk."}		/* leave as last */
 };
 
@@ -286,6 +291,21 @@ static char *usb_dump_interface(
 	return start;
 }
 
+static char *usb_dump_iad_descriptor(char *start, char *end,
+	const struct usb_interface_assoc_descriptor *iad)
+{
+	if (start > end)
+		return start;
+	start += sprintf(start, format_iad,
+			 iad->bFirstInterface,
+			 iad->bInterfaceCount,
+			 iad->bFunctionClass,
+			 class_decode(iad->bFunctionClass),
+			 iad->bFunctionSubClass,
+			 iad->bFunctionProtocol);
+	return start;
+}
+
 /* TBD:
  * 0. TBDs
  * 1. marking active interface altsettings (code lists all, but should mark
@@ -322,6 +342,12 @@ static char *usb_dump_config (
 	if (!config)		/* getting these some in 2.3.7; none in 2.3.6 */
 		return start + sprintf(start, "(null Cfg. desc.)\n");
 	start = usb_dump_config_descriptor(start, end, &config->desc, active);
+	for (i = 0; i < USB_MAXIADS; i++) {
+		if (config->intf_assoc[i] == NULL)
+			break;
+		start = usb_dump_iad_descriptor(start, end,
+					config->intf_assoc[i]);
+	}
 	for (i = 0; i < config->desc.bNumInterfaces; i++) {
 		intfc = config->intf_cache[i];
 		interface = config->interface[i];
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 4c1432314711..530e854961ce 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1384,6 +1384,36 @@ struct device_type usb_if_device_type = {
 	.uevent =	usb_if_uevent,
 };
 
+static struct usb_interface_assoc_descriptor *find_iad(struct usb_device *dev,
+						       struct usb_host_config *config,
+						       u8 inum)
+{
+	struct usb_interface_assoc_descriptor *retval = NULL;
+	struct usb_interface_assoc_descriptor *intf_assoc;
+	int first_intf;
+	int last_intf;
+	int i;
+
+	for (i = 0; (i < USB_MAXIADS && config->intf_assoc[i]); i++) {
+		intf_assoc = config->intf_assoc[i];
+		if (intf_assoc->bInterfaceCount == 0)
+			continue;
+
+		first_intf = intf_assoc->bFirstInterface;
+		last_intf = first_intf + (intf_assoc->bInterfaceCount - 1);
+		if (inum >= first_intf && inum <= last_intf) {
+			if (!retval)
+				retval = intf_assoc;
+			else
+				dev_err(&dev->dev, "Interface #%d referenced"
+					" by multiple IADs\n", inum);
+		}
+	}
+
+	return retval;
+}
+
+
 /*
  * usb_set_configuration - Makes a particular device setting be current
  * @dev: the device whose configuration is being updated
@@ -1530,6 +1560,7 @@ free_interfaces:
 		intfc = cp->intf_cache[i];
 		intf->altsetting = intfc->altsetting;
 		intf->num_altsetting = intfc->num_altsetting;
+		intf->intf_assoc = find_iad(dev, cp, i);
 		kref_get(&intfc->ref);
 
 		alt = usb_altnum_to_altsetting(intf, 0);
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index 5dfe31bc32ba..d47ae89154a7 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -495,6 +495,25 @@ void usb_remove_sysfs_dev_files(struct usb_device *udev)
 	sysfs_remove_group(&dev->kobj, &dev_attr_grp);
 }
 
+/* Interface Accociation Descriptor fields */
+#define usb_intf_assoc_attr(field, format_string)			\
+static ssize_t								\
+show_iad_##field (struct device *dev, struct device_attribute *attr,	\
+		char *buf)						\
+{									\
+	struct usb_interface *intf = to_usb_interface (dev);		\
+									\
+	return sprintf (buf, format_string,				\
+			intf->intf_assoc->field); 		\
+}									\
+static DEVICE_ATTR(iad_##field, S_IRUGO, show_iad_##field, NULL);
+
+usb_intf_assoc_attr (bFirstInterface, "%02x\n")
+usb_intf_assoc_attr (bInterfaceCount, "%02d\n")
+usb_intf_assoc_attr (bFunctionClass, "%02x\n")
+usb_intf_assoc_attr (bFunctionSubClass, "%02x\n")
+usb_intf_assoc_attr (bFunctionProtocol, "%02x\n")
+
 /* Interface fields */
 #define usb_intf_attr(field, format_string)				\
 static ssize_t								\
@@ -558,6 +577,18 @@ static ssize_t show_modalias(struct device *dev,
 }
 static DEVICE_ATTR(modalias, S_IRUGO, show_modalias, NULL);
 
+static struct attribute *intf_assoc_attrs[] = {
+	&dev_attr_iad_bFirstInterface.attr,
+	&dev_attr_iad_bInterfaceCount.attr,
+	&dev_attr_iad_bFunctionClass.attr,
+	&dev_attr_iad_bFunctionSubClass.attr,
+	&dev_attr_iad_bFunctionProtocol.attr,
+	NULL,
+};
+static struct attribute_group intf_assoc_attr_grp = {
+	.attrs = intf_assoc_attrs,
+};
+
 static struct attribute *intf_attrs[] = {
 	&dev_attr_bInterfaceNumber.attr,
 	&dev_attr_bAlternateSetting.attr,
@@ -609,6 +640,8 @@ int usb_create_sysfs_intf_files(struct usb_interface *intf)
 		alt->string = usb_cache_string(udev, alt->desc.iInterface);
 	if (alt->string)
 		retval = device_create_file(dev, &dev_attr_interface);
+	if (intf->intf_assoc)
+		retval = sysfs_create_group(&dev->kobj, &intf_assoc_attr_grp);
 	usb_create_intf_ep_files(intf, udev);
 	return 0;
 }
@@ -620,4 +653,5 @@ void usb_remove_sysfs_intf_files(struct usb_interface *intf)
 	usb_remove_intf_ep_files(intf);
 	device_remove_file(dev, &dev_attr_interface);
 	sysfs_remove_group(&dev->kobj, &intf_attr_grp);
+	sysfs_remove_group(&intf->dev.kobj, &intf_assoc_attr_grp);
 }
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 533c32374e01..7a60946df3b6 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -146,6 +146,10 @@ struct usb_interface {
 					 * active alternate setting */
 	unsigned num_altsetting;	/* number of alternate settings */
 
+	/* If there is an interface association descriptor then it will list
+	 * the associated interfaces */
+	struct usb_interface_assoc_descriptor *intf_assoc;
+
 	int minor;			/* minor number this interface is
 					 * bound to */
 	enum usb_interface_condition condition;		/* state of binding */
@@ -175,6 +179,7 @@ void usb_put_intf(struct usb_interface *intf);
 
 /* this maximum is arbitrary */
 #define USB_MAXINTERFACES	32
+#define USB_MAXIADS		USB_MAXINTERFACES/2
 
 /**
  * struct usb_interface_cache - long-term representation of a device interface
@@ -245,6 +250,11 @@ struct usb_host_config {
 	struct usb_config_descriptor	desc;
 
 	char *string;		/* iConfiguration string, if present */
+
+	/* List of any Interface Association Descriptors in this
+	 * configuration. */
+	struct usb_interface_assoc_descriptor *intf_assoc[USB_MAXIADS];
+
 	/* the interfaces associated with this configuration,
 	 * stored in no particular order */
 	struct usb_interface *interface[USB_MAXINTERFACES];
-- 
cgit v1.3-8-gc7d7


From 9d8bab58b758cd5a96d368a8cc64111c9ab50407 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 1 Jul 2007 11:04:54 -0700
Subject: usb gadget stack: remove usb_ep_*_buffer(), part 1

Remove usb_ep_{alloc,free}_buffer() calls, for small dma-coherent buffers.
This patch just removes the interface and its users; later patches will
remove controller driver support.

  - This interface is invariably not implemented correctly in the
    controller drivers (e.g. using dma pools, a mechanism which
    post-dates the interface by several years).

  - At this point no gadget driver really *needs* to use it.  In
    current kernels, any driver that needs such a mechanism could
    allocate a dma pool themselves.

Removing this interface is thus a simplification and improvement.

Note that the gmidi.c driver had a bug in this area; fixed.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/file_storage.c | 19 +++++-------------
 drivers/usb/gadget/gmidi.c        |  8 +-------
 drivers/usb/gadget/inode.c        |  4 ++--
 drivers/usb/gadget/zero.c         |  9 +++------
 include/linux/usb_gadget.h        | 41 ---------------------------------------
 5 files changed, 11 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 7e650d015585..8712ef987179 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -3733,19 +3733,12 @@ static void /* __init_or_exit */ fsg_unbind(struct usb_gadget *gadget)
 	}
 
 	/* Free the data buffers */
-	for (i = 0; i < NUM_BUFFERS; ++i) {
-		struct fsg_buffhd	*bh = &fsg->buffhds[i];
-
-		if (bh->buf)
-			usb_ep_free_buffer(fsg->bulk_in, bh->buf, bh->dma,
-					mod_data.buflen);
-	}
+	for (i = 0; i < NUM_BUFFERS; ++i)
+		kfree(fsg->buffhds[i].buf);
 
 	/* Free the request and buffer for endpoint 0 */
 	if (req) {
-		if (req->buf)
-			usb_ep_free_buffer(fsg->ep0, req->buf,
-					req->dma, EP0_BUFSIZE);
+		kfree(req->buf);
 		usb_ep_free_request(fsg->ep0, req);
 	}
 
@@ -3972,8 +3965,7 @@ static int __init fsg_bind(struct usb_gadget *gadget)
 	fsg->ep0req = req = usb_ep_alloc_request(fsg->ep0, GFP_KERNEL);
 	if (!req)
 		goto out;
-	req->buf = usb_ep_alloc_buffer(fsg->ep0, EP0_BUFSIZE,
-			&req->dma, GFP_KERNEL);
+	req->buf = kmalloc(EP0_BUFSIZE, GFP_KERNEL);
 	if (!req->buf)
 		goto out;
 	req->complete = ep0_complete;
@@ -3985,8 +3977,7 @@ static int __init fsg_bind(struct usb_gadget *gadget)
 		/* Allocate for the bulk-in endpoint.  We assume that
 		 * the buffer will also work with the bulk-out (and
 		 * interrupt-in) endpoint. */
-		bh->buf = usb_ep_alloc_buffer(fsg->bulk_in, mod_data.buflen,
-				&bh->dma, GFP_KERNEL);
+		bh->buf = kmalloc(mod_data.buflen, GFP_KERNEL);
 		if (!bh->buf)
 			goto out;
 		bh->next = bh + 1;
diff --git a/drivers/usb/gadget/gmidi.c b/drivers/usb/gadget/gmidi.c
index d08a8d0e6427..1c5aa49d7432 100644
--- a/drivers/usb/gadget/gmidi.c
+++ b/drivers/usb/gadget/gmidi.c
@@ -1248,17 +1248,11 @@ autoconf_fail:
 	tasklet_init(&dev->tasklet, gmidi_in_tasklet, (unsigned long)dev);
 
 	/* preallocate control response and buffer */
-	dev->req = usb_ep_alloc_request(gadget->ep0, GFP_KERNEL);
+	dev->req = alloc_ep_req(gadget->ep0, USB_BUFSIZ);
 	if (!dev->req) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	dev->req->buf = usb_ep_alloc_buffer(gadget->ep0, USB_BUFSIZ,
-				&dev->req->dma, GFP_KERNEL);
-	if (!dev->req->buf) {
-		err = -ENOMEM;
-		goto fail;
-	}
 
 	dev->req->complete = gmidi_setup_complete;
 
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index f723e083c9d0..e60745ffaf8e 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -923,7 +923,7 @@ static void clean_req (struct usb_ep *ep, struct usb_request *req)
 	struct dev_data		*dev = ep->driver_data;
 
 	if (req->buf != dev->rbuf) {
-		usb_ep_free_buffer (ep, req->buf, req->dma, req->length);
+		kfree(req->buf);
 		req->buf = dev->rbuf;
 		req->dma = DMA_ADDR_INVALID;
 	}
@@ -963,7 +963,7 @@ static int setup_req (struct usb_ep *ep, struct usb_request *req, u16 len)
 		return -EBUSY;
 	}
 	if (len > sizeof (dev->rbuf))
-		req->buf = usb_ep_alloc_buffer (ep, len, &req->dma, GFP_ATOMIC);
+		req->buf = kmalloc(len, GFP_ATOMIC);
 	if (req->buf == 0) {
 		req->buf = dev->rbuf;
 		return -ENOMEM;
diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c
index 7078374d0b79..a2e6e3fc8c8d 100644
--- a/drivers/usb/gadget/zero.c
+++ b/drivers/usb/gadget/zero.c
@@ -481,8 +481,7 @@ alloc_ep_req (struct usb_ep *ep, unsigned length)
 	req = usb_ep_alloc_request (ep, GFP_ATOMIC);
 	if (req) {
 		req->length = length;
-		req->buf = usb_ep_alloc_buffer (ep, length,
-				&req->dma, GFP_ATOMIC);
+		req->buf = kmalloc(length, GFP_ATOMIC);
 		if (!req->buf) {
 			usb_ep_free_request (ep, req);
 			req = NULL;
@@ -493,8 +492,7 @@ alloc_ep_req (struct usb_ep *ep, unsigned length)
 
 static void free_ep_req (struct usb_ep *ep, struct usb_request *req)
 {
-	if (req->buf)
-		usb_ep_free_buffer (ep, req->buf, req->dma, req->length);
+	kfree(req->buf);
 	usb_ep_free_request (ep, req);
 }
 
@@ -1199,8 +1197,7 @@ autoconf_fail:
 	dev->req = usb_ep_alloc_request (gadget->ep0, GFP_KERNEL);
 	if (!dev->req)
 		goto enomem;
-	dev->req->buf = usb_ep_alloc_buffer (gadget->ep0, USB_BUFSIZ,
-				&dev->req->dma, GFP_KERNEL);
+	dev->req->buf = kmalloc(USB_BUFSIZ, GFP_KERNEL);
 	if (!dev->req->buf)
 		goto enomem;
 
diff --git a/include/linux/usb_gadget.h b/include/linux/usb_gadget.h
index e17186dbcdca..703fd84c46fc 100644
--- a/include/linux/usb_gadget.h
+++ b/include/linux/usb_gadget.h
@@ -234,47 +234,6 @@ usb_ep_free_request (struct usb_ep *ep, struct usb_request *req)
 	ep->ops->free_request (ep, req);
 }
 
-/**
- * usb_ep_alloc_buffer - allocate an I/O buffer
- * @ep:the endpoint associated with the buffer
- * @len:length of the desired buffer
- * @dma:pointer to the buffer's DMA address; must be valid
- * @gfp_flags:GFP_* flags to use
- *
- * Returns a new buffer, or null if one could not be allocated.
- * The buffer is suitably aligned for dma, if that endpoint uses DMA,
- * and the caller won't have to care about dma-inconsistency
- * or any hidden "bounce buffer" mechanism.  No additional per-request
- * DMA mapping will be required for such buffers.
- * Free it later with usb_ep_free_buffer().
- *
- * You don't need to use this call to allocate I/O buffers unless you
- * want to make sure drivers don't incur costs for such "bounce buffer"
- * copies or per-request DMA mappings.
- */
-static inline void *
-usb_ep_alloc_buffer (struct usb_ep *ep, unsigned len, dma_addr_t *dma,
-	gfp_t gfp_flags)
-{
-	return ep->ops->alloc_buffer (ep, len, dma, gfp_flags);
-}
-
-/**
- * usb_ep_free_buffer - frees an i/o buffer
- * @ep:the endpoint associated with the buffer
- * @buf:CPU view address of the buffer
- * @dma:the buffer's DMA address
- * @len:length of the buffer
- *
- * reverses the effect of usb_ep_alloc_buffer().
- * caller guarantees the buffer will no longer be accessed
- */
-static inline void
-usb_ep_free_buffer (struct usb_ep *ep, void *buf, dma_addr_t dma, unsigned len)
-{
-	ep->ops->free_buffer (ep, buf, dma, len);
-}
-
 /**
  * usb_ep_queue - queues (submits) an I/O request to an endpoint.
  * @ep:the endpoint associated with the request
-- 
cgit v1.3-8-gc7d7


From c67ab134ba9f83f9de86e58adfeaa14a9efa6e00 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sun, 1 Jul 2007 12:21:00 -0700
Subject: usb gadget stack: remove usb_ep_*_buffer(), part 2

This patch removes controller driver infrastructure which supported
the now-removed usb_ep_{alloc,free}_buffer() calls.

As can be seen, many of the implementations of this were broken to
various degrees.  Many didn't properly return dma-coherent mappings;
those which did so were necessarily ugly because of bogosity in the
underlying dma_free_coherent() calls ... which on many platforms
can't be called from the same contexts (notably in_irq) from which
their dma_alloc_coherent() sibling can be called.

The main potential downside of removing this is that gadget drivers
wouldn't have specific knowledge that the controller drivers have:
endpoints that aren't dma-capable don't need any dma mappings at all.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/at91_udc.c     |  21 --------
 drivers/usb/gadget/dummy_hcd.c    |  36 -------------
 drivers/usb/gadget/fsl_usb2_udc.c |  36 -------------
 drivers/usb/gadget/goku_udc.c     |  48 -----------------
 drivers/usb/gadget/lh7a40x_udc.c  |  27 ----------
 drivers/usb/gadget/net2280.c      |  97 ----------------------------------
 drivers/usb/gadget/omap_udc.c     | 108 --------------------------------------
 drivers/usb/gadget/pxa2xx_udc.c   |  23 --------
 drivers/usb/gadget/s3c2410_udc.c  |  33 ------------
 include/linux/usb_gadget.h        |   7 ---
 10 files changed, 436 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/at91_udc.c b/drivers/usb/gadget/at91_udc.c
index ba163f35bf21..63d7d6568699 100644
--- a/drivers/usb/gadget/at91_udc.c
+++ b/drivers/usb/gadget/at91_udc.c
@@ -601,25 +601,6 @@ static void at91_ep_free_request(struct usb_ep *_ep, struct usb_request *_req)
 	kfree(req);
 }
 
-static void *at91_ep_alloc_buffer(
-	struct usb_ep *_ep,
-	unsigned bytes,
-	dma_addr_t *dma,
-	gfp_t gfp_flags)
-{
-	*dma = ~0;
-	return kmalloc(bytes, gfp_flags);
-}
-
-static void at91_ep_free_buffer(
-	struct usb_ep *ep,
-	void *buf,
-	dma_addr_t dma,
-	unsigned bytes)
-{
-	kfree(buf);
-}
-
 static int at91_ep_queue(struct usb_ep *_ep,
 			struct usb_request *_req, gfp_t gfp_flags)
 {
@@ -788,8 +769,6 @@ static const struct usb_ep_ops at91_ep_ops = {
 	.disable	= at91_ep_disable,
 	.alloc_request	= at91_ep_alloc_request,
 	.free_request	= at91_ep_free_request,
-	.alloc_buffer	= at91_ep_alloc_buffer,
-	.free_buffer	= at91_ep_free_buffer,
 	.queue		= at91_ep_queue,
 	.dequeue	= at91_ep_dequeue,
 	.set_halt	= at91_ep_set_halt,
diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c
index 9040b50d6425..f2fbdc7fe376 100644
--- a/drivers/usb/gadget/dummy_hcd.c
+++ b/drivers/usb/gadget/dummy_hcd.c
@@ -497,38 +497,6 @@ dummy_free_request (struct usb_ep *_ep, struct usb_request *_req)
 	kfree (req);
 }
 
-static void *
-dummy_alloc_buffer (
-	struct usb_ep *_ep,
-	unsigned bytes,
-	dma_addr_t *dma,
-	gfp_t mem_flags
-) {
-	char			*retval;
-	struct dummy_ep		*ep;
-	struct dummy		*dum;
-
-	ep = usb_ep_to_dummy_ep (_ep);
-	dum = ep_to_dummy (ep);
-
-	if (!dum->driver)
-		return NULL;
-	retval = kmalloc (bytes, mem_flags);
-	*dma = (dma_addr_t) retval;
-	return retval;
-}
-
-static void
-dummy_free_buffer (
-	struct usb_ep *_ep,
-	void *buf,
-	dma_addr_t dma,
-	unsigned bytes
-) {
-	if (bytes)
-		kfree (buf);
-}
-
 static void
 fifo_complete (struct usb_ep *ep, struct usb_request *req)
 {
@@ -659,10 +627,6 @@ static const struct usb_ep_ops dummy_ep_ops = {
 	.alloc_request	= dummy_alloc_request,
 	.free_request	= dummy_free_request,
 
-	.alloc_buffer	= dummy_alloc_buffer,
-	.free_buffer	= dummy_free_buffer,
-	/* map, unmap, ... eventually hook the "generic" dma calls */
-
 	.queue		= dummy_queue,
 	.dequeue	= dummy_dequeue,
 
diff --git a/drivers/usb/gadget/fsl_usb2_udc.c b/drivers/usb/gadget/fsl_usb2_udc.c
index 4e14bcd7c3b1..10b2b33b8698 100644
--- a/drivers/usb/gadget/fsl_usb2_udc.c
+++ b/drivers/usb/gadget/fsl_usb2_udc.c
@@ -601,39 +601,6 @@ static void fsl_free_request(struct usb_ep *_ep, struct usb_request *_req)
 		kfree(req);
 }
 
-/*------------------------------------------------------------------
- * Allocate an I/O buffer
-*---------------------------------------------------------------------*/
-static void *fsl_alloc_buffer(struct usb_ep *_ep, unsigned bytes,
-		dma_addr_t *dma, gfp_t gfp_flags)
-{
-	struct fsl_ep *ep;
-
-	if (!_ep)
-		return NULL;
-
-	ep = container_of(_ep, struct fsl_ep, ep);
-
-	return dma_alloc_coherent(ep->udc->gadget.dev.parent,
-			bytes, dma, gfp_flags);
-}
-
-/*------------------------------------------------------------------
- * frees an i/o buffer
-*---------------------------------------------------------------------*/
-static void fsl_free_buffer(struct usb_ep *_ep, void *buf,
-		dma_addr_t dma, unsigned bytes)
-{
-	struct fsl_ep *ep;
-
-	if (!_ep)
-		return;
-
-	ep = container_of(_ep, struct fsl_ep, ep);
-
-	dma_free_coherent(ep->udc->gadget.dev.parent, bytes, buf, dma);
-}
-
 /*-------------------------------------------------------------------------*/
 static int fsl_queue_td(struct fsl_ep *ep, struct fsl_req *req)
 {
@@ -1047,9 +1014,6 @@ static struct usb_ep_ops fsl_ep_ops = {
 	.alloc_request = fsl_alloc_request,
 	.free_request = fsl_free_request,
 
-	.alloc_buffer = fsl_alloc_buffer,
-	.free_buffer = fsl_free_buffer,
-
 	.queue = fsl_ep_queue,
 	.dequeue = fsl_ep_dequeue,
 
diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c
index ae931af05cef..dfadb643597b 100644
--- a/drivers/usb/gadget/goku_udc.c
+++ b/drivers/usb/gadget/goku_udc.c
@@ -296,51 +296,6 @@ goku_free_request(struct usb_ep *_ep, struct usb_request *_req)
 
 /*-------------------------------------------------------------------------*/
 
-/* allocating buffers this way eliminates dma mapping overhead, which
- * on some platforms will mean eliminating a per-io buffer copy.  with
- * some kinds of system caches, further tweaks may still be needed.
- */
-static void *
-goku_alloc_buffer(struct usb_ep *_ep, unsigned bytes,
-			dma_addr_t *dma, gfp_t gfp_flags)
-{
-	void		*retval;
-	struct goku_ep	*ep;
-
-	ep = container_of(_ep, struct goku_ep, ep);
-	if (!_ep)
-		return NULL;
-	*dma = DMA_ADDR_INVALID;
-
-	if (ep->dma) {
-		/* the main problem with this call is that it wastes memory
-		 * on typical 1/N page allocations: it allocates 1-N pages.
-		 */
-#warning Using dma_alloc_coherent even with buffers smaller than a page.
-		retval = dma_alloc_coherent(&ep->dev->pdev->dev,
-				bytes, dma, gfp_flags);
-	} else
-		retval = kmalloc(bytes, gfp_flags);
-	return retval;
-}
-
-static void
-goku_free_buffer(struct usb_ep *_ep, void *buf, dma_addr_t dma, unsigned bytes)
-{
-	/* free memory into the right allocator */
-	if (dma != DMA_ADDR_INVALID) {
-		struct goku_ep	*ep;
-
-		ep = container_of(_ep, struct goku_ep, ep);
-		if (!_ep)
-			return;
-		dma_free_coherent(&ep->dev->pdev->dev, bytes, buf, dma);
-	} else
-		kfree (buf);
-}
-
-/*-------------------------------------------------------------------------*/
-
 static void
 done(struct goku_ep *ep, struct goku_request *req, int status)
 {
@@ -1026,9 +981,6 @@ static struct usb_ep_ops goku_ep_ops = {
 	.alloc_request	= goku_alloc_request,
 	.free_request	= goku_free_request,
 
-	.alloc_buffer	= goku_alloc_buffer,
-	.free_buffer	= goku_free_buffer,
-
 	.queue		= goku_queue,
 	.dequeue	= goku_dequeue,
 
diff --git a/drivers/usb/gadget/lh7a40x_udc.c b/drivers/usb/gadget/lh7a40x_udc.c
index a0a73c08a344..e78c2ddc1f88 100644
--- a/drivers/usb/gadget/lh7a40x_udc.c
+++ b/drivers/usb/gadget/lh7a40x_udc.c
@@ -75,10 +75,6 @@ static int lh7a40x_ep_enable(struct usb_ep *ep,
 static int lh7a40x_ep_disable(struct usb_ep *ep);
 static struct usb_request *lh7a40x_alloc_request(struct usb_ep *ep, gfp_t);
 static void lh7a40x_free_request(struct usb_ep *ep, struct usb_request *);
-static void *lh7a40x_alloc_buffer(struct usb_ep *ep, unsigned, dma_addr_t *,
-				  gfp_t);
-static void lh7a40x_free_buffer(struct usb_ep *ep, void *, dma_addr_t,
-				unsigned);
 static int lh7a40x_queue(struct usb_ep *ep, struct usb_request *, gfp_t);
 static int lh7a40x_dequeue(struct usb_ep *ep, struct usb_request *);
 static int lh7a40x_set_halt(struct usb_ep *ep, int);
@@ -104,9 +100,6 @@ static struct usb_ep_ops lh7a40x_ep_ops = {
 	.alloc_request = lh7a40x_alloc_request,
 	.free_request = lh7a40x_free_request,
 
-	.alloc_buffer = lh7a40x_alloc_buffer,
-	.free_buffer = lh7a40x_free_buffer,
-
 	.queue = lh7a40x_queue,
 	.dequeue = lh7a40x_dequeue,
 
@@ -1134,26 +1127,6 @@ static void lh7a40x_free_request(struct usb_ep *ep, struct usb_request *_req)
 	kfree(req);
 }
 
-static void *lh7a40x_alloc_buffer(struct usb_ep *ep, unsigned bytes,
-				  dma_addr_t * dma, gfp_t gfp_flags)
-{
-	char *retval;
-
-	DEBUG("%s (%p, %d, %d)\n", __FUNCTION__, ep, bytes, gfp_flags);
-
-	retval = kmalloc(bytes, gfp_flags & ~(__GFP_DMA | __GFP_HIGHMEM));
-	if (retval)
-		*dma = virt_to_bus(retval);
-	return retval;
-}
-
-static void lh7a40x_free_buffer(struct usb_ep *ep, void *buf, dma_addr_t dma,
-				unsigned bytes)
-{
-	DEBUG("%s, %p\n", __FUNCTION__, ep);
-	kfree(buf);
-}
-
 /** Queue one request
  *  Kickstart transfer if needed
  *  NOTE: Sets INDEX register
diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c
index 00fda334dc72..c3d364ecd4f8 100644
--- a/drivers/usb/gadget/net2280.c
+++ b/drivers/usb/gadget/net2280.c
@@ -450,100 +450,6 @@ net2280_free_request (struct usb_ep *_ep, struct usb_request *_req)
 
 /*-------------------------------------------------------------------------*/
 
-/*
- * dma-coherent memory allocation (for dma-capable endpoints)
- *
- * NOTE: the dma_*_coherent() API calls suck.  Most implementations are
- * (a) page-oriented, so small buffers lose big; and (b) asymmetric with
- * respect to calls with irqs disabled:  alloc is safe, free is not.
- * We currently work around (b), but not (a).
- */
-
-static void *
-net2280_alloc_buffer (
-	struct usb_ep		*_ep,
-	unsigned		bytes,
-	dma_addr_t		*dma,
-	gfp_t			gfp_flags
-)
-{
-	void			*retval;
-	struct net2280_ep	*ep;
-
-	ep = container_of (_ep, struct net2280_ep, ep);
-	if (!_ep)
-		return NULL;
-	*dma = DMA_ADDR_INVALID;
-
-	if (ep->dma)
-		retval = dma_alloc_coherent(&ep->dev->pdev->dev,
-				bytes, dma, gfp_flags);
-	else
-		retval = kmalloc(bytes, gfp_flags);
-	return retval;
-}
-
-static DEFINE_SPINLOCK(buflock);
-static LIST_HEAD(buffers);
-
-struct free_record {
-	struct list_head	list;
-	struct device		*dev;
-	unsigned		bytes;
-	dma_addr_t		dma;
-};
-
-static void do_free(unsigned long ignored)
-{
-	spin_lock_irq(&buflock);
-	while (!list_empty(&buffers)) {
-		struct free_record	*buf;
-
-		buf = list_entry(buffers.next, struct free_record, list);
-		list_del(&buf->list);
-		spin_unlock_irq(&buflock);
-
-		dma_free_coherent(buf->dev, buf->bytes, buf, buf->dma);
-
-		spin_lock_irq(&buflock);
-	}
-	spin_unlock_irq(&buflock);
-}
-
-static DECLARE_TASKLET(deferred_free, do_free, 0);
-
-static void
-net2280_free_buffer (
-	struct usb_ep *_ep,
-	void *address,
-	dma_addr_t dma,
-	unsigned bytes
-) {
-	/* free memory into the right allocator */
-	if (dma != DMA_ADDR_INVALID) {
-		struct net2280_ep	*ep;
-		struct free_record	*buf = address;
-		unsigned long		flags;
-
-		ep = container_of(_ep, struct net2280_ep, ep);
-		if (!_ep)
-			return;
-
-		ep = container_of (_ep, struct net2280_ep, ep);
-		buf->dev = &ep->dev->pdev->dev;
-		buf->bytes = bytes;
-		buf->dma = dma;
-
-		spin_lock_irqsave(&buflock, flags);
-		list_add_tail(&buf->list, &buffers);
-		tasklet_schedule(&deferred_free);
-		spin_unlock_irqrestore(&buflock, flags);
-	} else
-		kfree (address);
-}
-
-/*-------------------------------------------------------------------------*/
-
 /* load a packet into the fifo we use for usb IN transfers.
  * works for all endpoints.
  *
@@ -1392,9 +1298,6 @@ static const struct usb_ep_ops net2280_ep_ops = {
 	.alloc_request	= net2280_alloc_request,
 	.free_request	= net2280_free_request,
 
-	.alloc_buffer	= net2280_alloc_buffer,
-	.free_buffer	= net2280_free_buffer,
-
 	.queue		= net2280_queue,
 	.dequeue	= net2280_dequeue,
 
diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c
index c4975a6cf777..9b0f0925dddf 100644
--- a/drivers/usb/gadget/omap_udc.c
+++ b/drivers/usb/gadget/omap_udc.c
@@ -296,111 +296,6 @@ omap_free_request(struct usb_ep *ep, struct usb_request *_req)
 
 /*-------------------------------------------------------------------------*/
 
-/*
- * dma-coherent memory allocation (for dma-capable endpoints)
- *
- * NOTE: the dma_*_coherent() API calls suck.  Most implementations are
- * (a) page-oriented, so small buffers lose big; and (b) asymmetric with
- * respect to calls with irqs disabled:  alloc is safe, free is not.
- * We currently work around (b), but not (a).
- */
-
-static void *
-omap_alloc_buffer(
-	struct usb_ep	*_ep,
-	unsigned	bytes,
-	dma_addr_t	*dma,
-	gfp_t		gfp_flags
-)
-{
-	void		*retval;
-	struct omap_ep	*ep;
-
-	if (!_ep)
-		return NULL;
-
-	ep = container_of(_ep, struct omap_ep, ep);
-	if (use_dma && ep->has_dma) {
-		static int	warned;
-		if (!warned && bytes < PAGE_SIZE) {
-			dev_warn(ep->udc->gadget.dev.parent,
-				"using dma_alloc_coherent for "
-				"small allocations wastes memory\n");
-			warned++;
-		}
-		return dma_alloc_coherent(ep->udc->gadget.dev.parent,
-				bytes, dma, gfp_flags);
-	}
-
-	retval = kmalloc(bytes, gfp_flags);
-	if (retval)
-		*dma = virt_to_phys(retval);
-	return retval;
-}
-
-static DEFINE_SPINLOCK(buflock);
-static LIST_HEAD(buffers);
-
-struct free_record {
-	struct list_head	list;
-	struct device		*dev;
-	unsigned		bytes;
-	dma_addr_t		dma;
-};
-
-static void do_free(unsigned long ignored)
-{
-	spin_lock_irq(&buflock);
-	while (!list_empty(&buffers)) {
-		struct free_record	*buf;
-
-		buf = list_entry(buffers.next, struct free_record, list);
-		list_del(&buf->list);
-		spin_unlock_irq(&buflock);
-
-		dma_free_coherent(buf->dev, buf->bytes, buf, buf->dma);
-
-		spin_lock_irq(&buflock);
-	}
-	spin_unlock_irq(&buflock);
-}
-
-static DECLARE_TASKLET(deferred_free, do_free, 0);
-
-static void omap_free_buffer(
-	struct usb_ep	*_ep,
-	void		*buf,
-	dma_addr_t	dma,
-	unsigned	bytes
-)
-{
-	if (!_ep) {
-		WARN_ON(1);
-		return;
-	}
-
-	/* free memory into the right allocator */
-	if (dma != DMA_ADDR_INVALID) {
-		struct omap_ep		*ep;
-		struct free_record	*rec = buf;
-		unsigned long		flags;
-
-		ep = container_of(_ep, struct omap_ep, ep);
-
-		rec->dev = ep->udc->gadget.dev.parent;
-		rec->bytes = bytes;
-		rec->dma = dma;
-
-		spin_lock_irqsave(&buflock, flags);
-		list_add_tail(&rec->list, &buffers);
-		tasklet_schedule(&deferred_free);
-		spin_unlock_irqrestore(&buflock, flags);
-	} else
-		kfree(buf);
-}
-
-/*-------------------------------------------------------------------------*/
-
 static void
 done(struct omap_ep *ep, struct omap_req *req, int status)
 {
@@ -1271,9 +1166,6 @@ static struct usb_ep_ops omap_ep_ops = {
 	.alloc_request	= omap_alloc_request,
 	.free_request	= omap_free_request,
 
-	.alloc_buffer	= omap_alloc_buffer,
-	.free_buffer	= omap_free_buffer,
-
 	.queue		= omap_ep_queue,
 	.dequeue	= omap_ep_dequeue,
 
diff --git a/drivers/usb/gadget/pxa2xx_udc.c b/drivers/usb/gadget/pxa2xx_udc.c
index 484de6e97662..63b9521c1322 100644
--- a/drivers/usb/gadget/pxa2xx_udc.c
+++ b/drivers/usb/gadget/pxa2xx_udc.c
@@ -341,26 +341,6 @@ pxa2xx_ep_free_request (struct usb_ep *_ep, struct usb_request *_req)
 	kfree(req);
 }
 
-
-static void *
-pxa2xx_ep_alloc_buffer(struct usb_ep *_ep, unsigned bytes,
-	dma_addr_t *dma, gfp_t gfp_flags)
-{
-	char			*retval;
-
-	retval = kmalloc (bytes, gfp_flags & ~(__GFP_DMA|__GFP_HIGHMEM));
-	if (retval)
-		*dma = (dma_addr_t)~0;
-	return retval;
-}
-
-static void
-pxa2xx_ep_free_buffer(struct usb_ep *_ep, void *buf, dma_addr_t dma,
-		unsigned bytes)
-{
-	kfree (buf);
-}
-
 /*-------------------------------------------------------------------------*/
 
 /*
@@ -927,9 +907,6 @@ static struct usb_ep_ops pxa2xx_ep_ops = {
 	.alloc_request	= pxa2xx_ep_alloc_request,
 	.free_request	= pxa2xx_ep_free_request,
 
-	.alloc_buffer	= pxa2xx_ep_alloc_buffer,
-	.free_buffer	= pxa2xx_ep_free_buffer,
-
 	.queue		= pxa2xx_ep_queue,
 	.dequeue	= pxa2xx_ep_dequeue,
 
diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c
index d60748add325..0be80c635c48 100644
--- a/drivers/usb/gadget/s3c2410_udc.c
+++ b/drivers/usb/gadget/s3c2410_udc.c
@@ -1196,36 +1196,6 @@ s3c2410_udc_free_request(struct usb_ep *_ep, struct usb_request *_req)
 	kfree(req);
 }
 
-/*
- *	s3c2410_udc_alloc_buffer
- */
-static void *s3c2410_udc_alloc_buffer(struct usb_ep *_ep,
-		unsigned bytes, dma_addr_t *dma, gfp_t mem_flags)
-{
-	char *retval;
-
-	dprintk(DEBUG_VERBOSE, "%s()\n", __func__);
-
-	if (!the_controller->driver)
-		return NULL;
-
-	retval = kmalloc (bytes, mem_flags);
-	*dma = (dma_addr_t) retval;
-	return retval;
-}
-
-/*
- * s3c2410_udc_free_buffer
- */
-static void s3c2410_udc_free_buffer (struct usb_ep *_ep, void *buf,
-		dma_addr_t dma, unsigned bytes)
-{
-	dprintk(DEBUG_VERBOSE, "%s()\n", __func__);
-
-	if (bytes)
-		kfree (buf);
-}
-
 /*
  *	s3c2410_udc_queue
  */
@@ -1441,9 +1411,6 @@ static const struct usb_ep_ops s3c2410_ep_ops = {
 	.alloc_request	= s3c2410_udc_alloc_request,
 	.free_request	= s3c2410_udc_free_request,
 
-	.alloc_buffer	= s3c2410_udc_alloc_buffer,
-	.free_buffer	= s3c2410_udc_free_buffer,
-
 	.queue		= s3c2410_udc_queue,
 	.dequeue	= s3c2410_udc_dequeue,
 
diff --git a/include/linux/usb_gadget.h b/include/linux/usb_gadget.h
index 703fd84c46fc..4f59b2aa8a9e 100644
--- a/include/linux/usb_gadget.h
+++ b/include/linux/usb_gadget.h
@@ -110,13 +110,6 @@ struct usb_ep_ops {
 		gfp_t gfp_flags);
 	void (*free_request) (struct usb_ep *ep, struct usb_request *req);
 
-	void *(*alloc_buffer) (struct usb_ep *ep, unsigned bytes,
-		dma_addr_t *dma, gfp_t gfp_flags);
-	void (*free_buffer) (struct usb_ep *ep, void *buf, dma_addr_t dma,
-		unsigned bytes);
-	// NOTE:  on 2.6, drivers may also use dma_map() and
-	// dma_sync_single_*() to directly manage dma overhead. 
-
 	int (*queue) (struct usb_ep *ep, struct usb_request *req,
 		gfp_t gfp_flags);
 	int (*dequeue) (struct usb_ep *ep, struct usb_request *req);
-- 
cgit v1.3-8-gc7d7


From 7405f74badf46b5d023c5d2b670b4471525f6c91 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 11:10:43 -0700
Subject: dmaengine: refactor dmaengine around dma_async_tx_descriptor

The current dmaengine interface defines mutliple routines per operation,
i.e. dma_async_memcpy_buf_to_buf, dma_async_memcpy_buf_to_page etc.  Adding
more operation types (xor, crc, etc) to this model would result in an
unmanageable number of method permutations.

	Are we really going to add a set of hooks for each DMA engine
	whizbang feature?
		- Jeff Garzik

The descriptor creation process is refactored using the new common
dma_async_tx_descriptor structure.  Instead of per driver
do_<operation>_<dest>_to_<src> methods, drivers integrate
dma_async_tx_descriptor into their private software descriptor and then
define a 'prep' routine per operation.  The prep routine allocates a
descriptor and ensures that the tx_set_src, tx_set_dest, tx_submit routines
are valid.  Descriptor creation and submission becomes:

struct dma_device *dev;
struct dma_chan *chan;
struct dma_async_tx_descriptor *tx;

tx = dev->device_prep_dma_<operation>(chan, len, int_flag)
tx->tx_set_src(dma_addr_t, tx, index /* for multi-source ops */)
tx->tx_set_dest(dma_addr_t, tx, index)
tx->tx_submit(tx)

In addition to the refactoring, dma_async_tx_descriptor also lays the
groundwork for definining cross-channel-operation dependencies, and a
callback facility for asynchronous notification of operation completion.

Changelog:
* drop dma mapping methods, suggested by Chris Leech
* fix ioat_dma_dependency_added, also caught by Andrew Morton
* fix dma_sync_wait, change from Andrew Morton
* uninline large functions, change from Andrew Morton
* add tx->callback = NULL to dmaengine calls to interoperate with async_tx
  calls
* hookup ioat_tx_submit
* convert channel capabilities to a 'cpumask_t like' bitmap
* removed DMA_TX_ARRAY_INIT, no longer needed
* checkpatch.pl fixes
* make set_src, set_dest, and tx_submit descriptor specific methods
* fixup git-ioat merge
* move group_list and phys to dma_async_tx_descriptor

Cc: Jeff Garzik <jeff@garzik.org>
Cc: Chris Leech <christopher.leech@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/dmaengine.c   | 182 ++++++++++++++++++++++++++++
 drivers/dma/ioatdma.c     | 295 +++++++++++++++++++++-------------------------
 drivers/dma/ioatdma.h     |  13 +-
 include/linux/dmaengine.h | 237 +++++++++++++++++++++++--------------
 4 files changed, 474 insertions(+), 253 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 828310d8be80..404cc7b6e705 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -59,6 +59,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/mm.h>
 #include <linux/device.h>
 #include <linux/dmaengine.h>
 #include <linux/hardirq.h>
@@ -66,6 +67,7 @@
 #include <linux/percpu.h>
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
+#include <linux/jiffies.h>
 
 static DEFINE_MUTEX(dma_list_mutex);
 static LIST_HEAD(dma_device_list);
@@ -165,6 +167,24 @@ static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
 	return NULL;
 }
 
+enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie)
+{
+	enum dma_status status;
+	unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000);
+
+	dma_async_issue_pending(chan);
+	do {
+		status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
+		if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
+			printk(KERN_ERR "dma_sync_wait_timeout!\n");
+			return DMA_ERROR;
+		}
+	} while (status == DMA_IN_PROGRESS);
+
+	return status;
+}
+EXPORT_SYMBOL(dma_sync_wait);
+
 /**
  * dma_chan_cleanup - release a DMA channel's resources
  * @kref: kernel reference structure that contains the DMA channel device
@@ -322,6 +342,25 @@ int dma_async_device_register(struct dma_device *device)
 	if (!device)
 		return -ENODEV;
 
+	/* validate device routines */
+	BUG_ON(dma_has_cap(DMA_MEMCPY, device->cap_mask) &&
+		!device->device_prep_dma_memcpy);
+	BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) &&
+		!device->device_prep_dma_xor);
+	BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) &&
+		!device->device_prep_dma_zero_sum);
+	BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
+		!device->device_prep_dma_memset);
+	BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) &&
+		!device->device_prep_dma_interrupt);
+
+	BUG_ON(!device->device_alloc_chan_resources);
+	BUG_ON(!device->device_free_chan_resources);
+	BUG_ON(!device->device_dependency_added);
+	BUG_ON(!device->device_is_tx_complete);
+	BUG_ON(!device->device_issue_pending);
+	BUG_ON(!device->dev);
+
 	init_completion(&device->done);
 	kref_init(&device->refcount);
 	device->dev_id = id++;
@@ -415,6 +454,149 @@ void dma_async_device_unregister(struct dma_device *device)
 }
 EXPORT_SYMBOL(dma_async_device_unregister);
 
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages).
+ */
+dma_cookie_t
+dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest,
+			void *src, size_t len)
+{
+	struct dma_device *dev = chan->device;
+	struct dma_async_tx_descriptor *tx;
+	dma_addr_t addr;
+	dma_cookie_t cookie;
+	int cpu;
+
+	tx = dev->device_prep_dma_memcpy(chan, len, 0);
+	if (!tx)
+		return -ENOMEM;
+
+	tx->ack = 1;
+	tx->callback = NULL;
+	addr = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE);
+	tx->tx_set_src(addr, tx, 0);
+	addr = dma_map_single(dev->dev, dest, len, DMA_FROM_DEVICE);
+	tx->tx_set_dest(addr, tx, 0);
+	cookie = tx->tx_submit(tx);
+
+	cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return cookie;
+}
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy from address to page
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+dma_cookie_t
+dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page,
+			unsigned int offset, void *kdata, size_t len)
+{
+	struct dma_device *dev = chan->device;
+	struct dma_async_tx_descriptor *tx;
+	dma_addr_t addr;
+	dma_cookie_t cookie;
+	int cpu;
+
+	tx = dev->device_prep_dma_memcpy(chan, len, 0);
+	if (!tx)
+		return -ENOMEM;
+
+	tx->ack = 1;
+	tx->callback = NULL;
+	addr = dma_map_single(dev->dev, kdata, len, DMA_TO_DEVICE);
+	tx->tx_set_src(addr, tx, 0);
+	addr = dma_map_page(dev->dev, page, offset, len, DMA_FROM_DEVICE);
+	tx->tx_set_dest(addr, tx, 0);
+	cookie = tx->tx_submit(tx);
+
+	cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return cookie;
+}
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+
+/**
+ * dma_async_memcpy_pg_to_pg - offloaded copy from page to page
+ * @chan: DMA channel to offload copy to
+ * @dest_pg: destination page
+ * @dest_off: offset in page to copy to
+ * @src_pg: source page
+ * @src_off: offset in page to copy from
+ * @len: length
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages).
+ */
+dma_cookie_t
+dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg,
+	unsigned int dest_off, struct page *src_pg, unsigned int src_off,
+	size_t len)
+{
+	struct dma_device *dev = chan->device;
+	struct dma_async_tx_descriptor *tx;
+	dma_addr_t addr;
+	dma_cookie_t cookie;
+	int cpu;
+
+	tx = dev->device_prep_dma_memcpy(chan, len, 0);
+	if (!tx)
+		return -ENOMEM;
+
+	tx->ack = 1;
+	tx->callback = NULL;
+	addr = dma_map_page(dev->dev, src_pg, src_off, len, DMA_TO_DEVICE);
+	tx->tx_set_src(addr, tx, 0);
+	addr = dma_map_page(dev->dev, dest_pg, dest_off, len, DMA_FROM_DEVICE);
+	tx->tx_set_dest(addr, tx, 0);
+	cookie = tx->tx_submit(tx);
+
+	cpu = get_cpu();
+	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+	put_cpu();
+
+	return cookie;
+}
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+
+void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
+	struct dma_chan *chan)
+{
+	tx->chan = chan;
+	spin_lock_init(&tx->lock);
+	INIT_LIST_HEAD(&tx->depend_node);
+	INIT_LIST_HEAD(&tx->depend_list);
+}
+EXPORT_SYMBOL(dma_async_tx_descriptor_init);
+
 static int __init dma_bus_init(void)
 {
 	mutex_init(&dma_list_mutex);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index c4209af4fde4..171044930282 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -38,6 +38,7 @@
 #define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
 #define to_ioat_device(dev) container_of(dev, struct ioat_device, common)
 #define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
+#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, async_tx)
 
 /* internal functions */
 static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
@@ -78,6 +79,73 @@ static int enumerate_dma_channels(struct ioat_device *device)
 	return device->common.chancnt;
 }
 
+static void
+ioat_set_src(dma_addr_t addr, struct dma_async_tx_descriptor *tx, int index)
+{
+	struct ioat_desc_sw *iter, *desc = tx_to_ioat_desc(tx);
+	struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
+
+	pci_unmap_addr_set(desc, src, addr);
+
+	list_for_each_entry(iter, &desc->async_tx.tx_list, node) {
+		iter->hw->src_addr = addr;
+		addr += ioat_chan->xfercap;
+	}
+
+}
+
+static void
+ioat_set_dest(dma_addr_t addr, struct dma_async_tx_descriptor *tx, int index)
+{
+	struct ioat_desc_sw *iter, *desc = tx_to_ioat_desc(tx);
+	struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
+
+	pci_unmap_addr_set(desc, dst, addr);
+
+	list_for_each_entry(iter, &desc->async_tx.tx_list, node) {
+		iter->hw->dst_addr = addr;
+		addr += ioat_chan->xfercap;
+	}
+}
+
+static dma_cookie_t
+ioat_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan);
+	struct ioat_desc_sw *desc = tx_to_ioat_desc(tx);
+	int append = 0;
+	dma_cookie_t cookie;
+	struct ioat_desc_sw *group_start;
+
+	group_start = list_entry(desc->async_tx.tx_list.next,
+				 struct ioat_desc_sw, node);
+	spin_lock_bh(&ioat_chan->desc_lock);
+	/* cookie incr and addition to used_list must be atomic */
+	cookie = ioat_chan->common.cookie;
+	cookie++;
+	if (cookie < 0)
+		cookie = 1;
+	ioat_chan->common.cookie = desc->async_tx.cookie = cookie;
+
+	/* write address into NextDescriptor field of last desc in chain */
+	to_ioat_desc(ioat_chan->used_desc.prev)->hw->next =
+						group_start->async_tx.phys;
+	list_splice_init(&desc->async_tx.tx_list, ioat_chan->used_desc.prev);
+
+	ioat_chan->pending += desc->tx_cnt;
+	if (ioat_chan->pending >= 4) {
+		append = 1;
+		ioat_chan->pending = 0;
+	}
+	spin_unlock_bh(&ioat_chan->desc_lock);
+
+	if (append)
+		writeb(IOAT_CHANCMD_APPEND,
+			ioat_chan->reg_base + IOAT_CHANCMD_OFFSET);
+	
+	return cookie;
+}
+
 static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
 	struct ioat_dma_chan *ioat_chan,
 	gfp_t flags)
@@ -99,8 +167,13 @@ static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
 	}
 
 	memset(desc, 0, sizeof(*desc));
+	dma_async_tx_descriptor_init(&desc_sw->async_tx, &ioat_chan->common);
+	desc_sw->async_tx.tx_set_src = ioat_set_src;
+	desc_sw->async_tx.tx_set_dest = ioat_set_dest;
+	desc_sw->async_tx.tx_submit = ioat_tx_submit;
+	INIT_LIST_HEAD(&desc_sw->async_tx.tx_list);
 	desc_sw->hw = desc;
-	desc_sw->phys = phys;
+	desc_sw->async_tx.phys = phys;
 
 	return desc_sw;
 }
@@ -188,12 +261,14 @@ static void ioat_dma_free_chan_resources(struct dma_chan *chan)
 	list_for_each_entry_safe(desc, _desc, &ioat_chan->used_desc, node) {
 		in_use_descs++;
 		list_del(&desc->node);
-		pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys);
+		pci_pool_free(ioat_device->dma_pool, desc->hw,
+			      desc->async_tx.phys);
 		kfree(desc);
 	}
 	list_for_each_entry_safe(desc, _desc, &ioat_chan->free_desc, node) {
 		list_del(&desc->node);
-		pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys);
+		pci_pool_free(ioat_device->dma_pool, desc->hw,
+			      desc->async_tx.phys);
 		kfree(desc);
 	}
 	spin_unlock_bh(&ioat_chan->desc_lock);
@@ -215,45 +290,25 @@ static void ioat_dma_free_chan_resources(struct dma_chan *chan)
 	writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET);
 }
 
-/**
- * do_ioat_dma_memcpy - actual function that initiates a IOAT DMA transaction
- * @ioat_chan: IOAT DMA channel handle
- * @dest: DMA destination address
- * @src: DMA source address
- * @len: transaction length in bytes
- */
-
-static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan,
-                                       dma_addr_t dest,
-                                       dma_addr_t src,
-                                       size_t len)
+static struct dma_async_tx_descriptor *
+ioat_dma_prep_memcpy(struct dma_chan *chan, size_t len, int int_en)
 {
-	struct ioat_desc_sw *first;
-	struct ioat_desc_sw *prev;
-	struct ioat_desc_sw *new;
-	dma_cookie_t cookie;
+	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+	struct ioat_desc_sw *first, *prev, *new;
 	LIST_HEAD(new_chain);
 	u32 copy;
 	size_t orig_len;
-	dma_addr_t orig_src, orig_dst;
-	unsigned int desc_count = 0;
-	unsigned int append = 0;
-
-	if (!ioat_chan || !dest || !src)
-		return -EFAULT;
+	int desc_count = 0;
 
 	if (!len)
-		return ioat_chan->common.cookie;
+		return NULL;
 
 	orig_len = len;
-	orig_src = src;
-	orig_dst = dest;
 
 	first = NULL;
 	prev = NULL;
 
 	spin_lock_bh(&ioat_chan->desc_lock);
-
 	while (len) {
 		if (!list_empty(&ioat_chan->free_desc)) {
 			new = to_ioat_desc(ioat_chan->free_desc.next);
@@ -270,140 +325,36 @@ static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan,
 
 		new->hw->size = copy;
 		new->hw->ctl = 0;
-		new->hw->src_addr = src;
-		new->hw->dst_addr = dest;
-		new->cookie = 0;
+		new->async_tx.cookie = 0;
+		new->async_tx.ack = 1;
 
 		/* chain together the physical address list for the HW */
 		if (!first)
 			first = new;
 		else
-			prev->hw->next = (u64) new->phys;
+			prev->hw->next = (u64) new->async_tx.phys;
 
 		prev = new;
-
 		len  -= copy;
-		dest += copy;
-		src  += copy;
-
 		list_add_tail(&new->node, &new_chain);
 		desc_count++;
 	}
-	new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
-	new->hw->next = 0;
 
-	/* cookie incr and addition to used_list must be atomic */
+	list_splice(&new_chain, &new->async_tx.tx_list);
 
-	cookie = ioat_chan->common.cookie;
-	cookie++;
-	if (cookie < 0)
-		cookie = 1;
-	ioat_chan->common.cookie = new->cookie = cookie;
+	new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
+	new->hw->next = 0;
+	new->tx_cnt = desc_count;
+	new->async_tx.ack = 0; /* client is in control of this ack */
+	new->async_tx.cookie = -EBUSY;
 
-	pci_unmap_addr_set(new, src, orig_src);
-	pci_unmap_addr_set(new, dst, orig_dst);
 	pci_unmap_len_set(new, src_len, orig_len);
 	pci_unmap_len_set(new, dst_len, orig_len);
-
-	/* write address into NextDescriptor field of last desc in chain */
-	to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = first->phys;
-	list_splice_init(&new_chain, ioat_chan->used_desc.prev);
-
-	ioat_chan->pending += desc_count;
-	if (ioat_chan->pending >= 4) {
-		append = 1;
-		ioat_chan->pending = 0;
-	}
-
 	spin_unlock_bh(&ioat_chan->desc_lock);
 
-	if (append)
-		writeb(IOAT_CHANCMD_APPEND,
-		       ioat_chan->reg_base + IOAT_CHANCMD_OFFSET);
-	return cookie;
+	return new ? &new->async_tx : NULL;
 }
 
-/**
- * ioat_dma_memcpy_buf_to_buf - wrapper that takes src & dest bufs
- * @chan: IOAT DMA channel handle
- * @dest: DMA destination address
- * @src: DMA source address
- * @len: transaction length in bytes
- */
-
-static dma_cookie_t ioat_dma_memcpy_buf_to_buf(struct dma_chan *chan,
-                                               void *dest,
-                                               void *src,
-                                               size_t len)
-{
-	dma_addr_t dest_addr;
-	dma_addr_t src_addr;
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-
-	dest_addr = pci_map_single(ioat_chan->device->pdev,
-		dest, len, PCI_DMA_FROMDEVICE);
-	src_addr = pci_map_single(ioat_chan->device->pdev,
-		src, len, PCI_DMA_TODEVICE);
-
-	return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
-}
-
-/**
- * ioat_dma_memcpy_buf_to_pg - wrapper, copying from a buf to a page
- * @chan: IOAT DMA channel handle
- * @page: pointer to the page to copy to
- * @offset: offset into that page
- * @src: DMA source address
- * @len: transaction length in bytes
- */
-
-static dma_cookie_t ioat_dma_memcpy_buf_to_pg(struct dma_chan *chan,
-                                              struct page *page,
-                                              unsigned int offset,
-                                              void *src,
-                                              size_t len)
-{
-	dma_addr_t dest_addr;
-	dma_addr_t src_addr;
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-
-	dest_addr = pci_map_page(ioat_chan->device->pdev,
-		page, offset, len, PCI_DMA_FROMDEVICE);
-	src_addr = pci_map_single(ioat_chan->device->pdev,
-		src, len, PCI_DMA_TODEVICE);
-
-	return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
-}
-
-/**
- * ioat_dma_memcpy_pg_to_pg - wrapper, copying between two pages
- * @chan: IOAT DMA channel handle
- * @dest_pg: pointer to the page to copy to
- * @dest_off: offset into that page
- * @src_pg: pointer to the page to copy from
- * @src_off: offset into that page
- * @len: transaction length in bytes. This is guaranteed not to make a copy
- *	 across a page boundary.
- */
-
-static dma_cookie_t ioat_dma_memcpy_pg_to_pg(struct dma_chan *chan,
-                                             struct page *dest_pg,
-                                             unsigned int dest_off,
-                                             struct page *src_pg,
-                                             unsigned int src_off,
-                                             size_t len)
-{
-	dma_addr_t dest_addr;
-	dma_addr_t src_addr;
-	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
-
-	dest_addr = pci_map_page(ioat_chan->device->pdev,
-		dest_pg, dest_off, len, PCI_DMA_FROMDEVICE);
-	src_addr = pci_map_page(ioat_chan->device->pdev,
-		src_pg, src_off, len, PCI_DMA_TODEVICE);
-
-	return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
-}
 
 /**
  * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended descriptors to hw
@@ -465,8 +416,8 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
 		 * exceeding xfercap, perhaps. If so, only the last one will
 		 * have a cookie, and require unmapping.
 		 */
-		if (desc->cookie) {
-			cookie = desc->cookie;
+		if (desc->async_tx.cookie) {
+			cookie = desc->async_tx.cookie;
 
 			/* yes we are unmapping both _page and _single alloc'd
 			   regions with unmap_page. Is this *really* that bad?
@@ -481,14 +432,19 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
 					PCI_DMA_TODEVICE);
 		}
 
-		if (desc->phys != phys_complete) {
-			/* a completed entry, but not the last, so cleanup */
-			list_del(&desc->node);
-			list_add_tail(&desc->node, &chan->free_desc);
+		if (desc->async_tx.phys != phys_complete) {
+			/* a completed entry, but not the last, so cleanup
+			 * if the client is done with the descriptor
+			 */
+			if (desc->async_tx.ack) {
+				list_del(&desc->node);
+				list_add_tail(&desc->node, &chan->free_desc);
+			} else
+				desc->async_tx.cookie = 0;
 		} else {
 			/* last used desc. Do not remove, so we can append from
 			   it, but don't look at it next time, either */
-			desc->cookie = 0;
+			desc->async_tx.cookie = 0;
 
 			/* TODO check status bits? */
 			break;
@@ -504,6 +460,17 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
 	spin_unlock(&chan->cleanup_lock);
 }
 
+static void ioat_dma_dependency_added(struct dma_chan *chan)
+{
+	struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+	spin_lock_bh(&ioat_chan->desc_lock);
+	if (ioat_chan->pending == 0) {
+		spin_unlock_bh(&ioat_chan->desc_lock);
+		ioat_dma_memcpy_cleanup(ioat_chan);
+	} else
+		spin_unlock_bh(&ioat_chan->desc_lock);
+}
+
 /**
  * ioat_dma_is_complete - poll the status of a IOAT DMA transaction
  * @chan: IOAT DMA channel handle
@@ -606,13 +573,14 @@ static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan)
 
 	desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL;
 	desc->hw->next = 0;
+	desc->async_tx.ack = 1;
 
 	list_add_tail(&desc->node, &ioat_chan->used_desc);
 	spin_unlock_bh(&ioat_chan->desc_lock);
 
-	writel(((u64) desc->phys) & 0x00000000FFFFFFFF,
+	writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF,
 	       ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_LOW);
-	writel(((u64) desc->phys) >> 32,
+	writel(((u64) desc->async_tx.phys) >> 32,
 	       ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
 
 	writeb(IOAT_CHANCMD_START, ioat_chan->reg_base + IOAT_CHANCMD_OFFSET);
@@ -629,6 +597,8 @@ static int ioat_self_test(struct ioat_device *device)
 	u8 *src;
 	u8 *dest;
 	struct dma_chan *dma_chan;
+	struct dma_async_tx_descriptor *tx;
+	dma_addr_t addr;
 	dma_cookie_t cookie;
 	int err = 0;
 
@@ -654,7 +624,15 @@ static int ioat_self_test(struct ioat_device *device)
 		goto out;
 	}
 
-	cookie = ioat_dma_memcpy_buf_to_buf(dma_chan, dest, src, IOAT_TEST_SIZE);
+	tx = ioat_dma_prep_memcpy(dma_chan, IOAT_TEST_SIZE, 0);
+	async_tx_ack(tx);
+	addr = dma_map_single(dma_chan->device->dev, src, IOAT_TEST_SIZE,
+			DMA_TO_DEVICE);
+	ioat_set_src(addr, tx, 0);
+	addr = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE,
+			DMA_FROM_DEVICE);
+	ioat_set_dest(addr, tx, 0);
+	cookie = ioat_tx_submit(tx);
 	ioat_dma_memcpy_issue_pending(dma_chan);
 	msleep(1);
 
@@ -750,13 +728,14 @@ static int __devinit ioat_probe(struct pci_dev *pdev,
 	INIT_LIST_HEAD(&device->common.channels);
 	enumerate_dma_channels(device);
 
+	dma_cap_set(DMA_MEMCPY, device->common.cap_mask);
 	device->common.device_alloc_chan_resources = ioat_dma_alloc_chan_resources;
 	device->common.device_free_chan_resources = ioat_dma_free_chan_resources;
-	device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf;
-	device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg;
-	device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg;
-	device->common.device_memcpy_complete = ioat_dma_is_complete;
-	device->common.device_memcpy_issue_pending = ioat_dma_memcpy_issue_pending;
+	device->common.device_prep_dma_memcpy = ioat_dma_prep_memcpy;
+	device->common.device_is_tx_complete = ioat_dma_is_complete;
+	device->common.device_issue_pending = ioat_dma_memcpy_issue_pending;
+	device->common.device_dependency_added = ioat_dma_dependency_added;
+	device->common.dev = &pdev->dev;
 	printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n",
 		device->common.chancnt);
 
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index 62b26a9be4c9..d3f69bb15fa0 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -105,21 +105,20 @@ struct ioat_dma_chan {
 /**
  * struct ioat_desc_sw - wrapper around hardware descriptor
  * @hw: hardware DMA descriptor
- * @node:
- * @cookie:
- * @phys:
+ * @node: this descriptor will either be on the free list,
+ *     or attached to a transaction list (async_tx.tx_list)
+ * @tx_cnt: number of descriptors required to complete the transaction
+ * @async_tx: the generic software descriptor for all engines
  */
-
 struct ioat_desc_sw {
 	struct ioat_dma_descriptor *hw;
 	struct list_head node;
-	dma_cookie_t cookie;
-	dma_addr_t phys;
+	int tx_cnt;
 	DECLARE_PCI_UNMAP_ADDR(src)
 	DECLARE_PCI_UNMAP_LEN(src_len)
 	DECLARE_PCI_UNMAP_ADDR(dst)
 	DECLARE_PCI_UNMAP_LEN(dst_len)
+	struct dma_async_tx_descriptor async_tx;
 };
 
 #endif /* IOATDMA_H */
-
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index c94d8f1d62e5..3de1cf71031a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -21,13 +21,12 @@
 #ifndef DMAENGINE_H
 #define DMAENGINE_H
 
-#ifdef CONFIG_DMA_ENGINE
-
 #include <linux/device.h>
 #include <linux/uio.h>
 #include <linux/kref.h>
 #include <linux/completion.h>
 #include <linux/rcupdate.h>
+#include <linux/dma-mapping.h>
 
 /**
  * enum dma_event - resource PNP/power managment events
@@ -64,6 +63,31 @@ enum dma_status {
 	DMA_ERROR,
 };
 
+/**
+ * enum dma_transaction_type - DMA transaction types/indexes
+ */
+enum dma_transaction_type {
+	DMA_MEMCPY,
+	DMA_XOR,
+	DMA_PQ_XOR,
+	DMA_DUAL_XOR,
+	DMA_PQ_UPDATE,
+	DMA_ZERO_SUM,
+	DMA_PQ_ZERO_SUM,
+	DMA_MEMSET,
+	DMA_MEMCPY_CRC32C,
+	DMA_INTERRUPT,
+};
+
+/* last transaction type for creation of the capabilities mask */
+#define DMA_TX_TYPE_END (DMA_INTERRUPT + 1)
+
+/**
+ * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
+ * See linux/cpumask.h
+ */
+typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t;
+
 /**
  * struct dma_chan_percpu - the per-CPU part of struct dma_chan
  * @refcount: local_t used for open-coded "bigref" counting
@@ -157,48 +181,106 @@ struct dma_client {
 	struct list_head	global_node;
 };
 
+typedef void (*dma_async_tx_callback)(void *dma_async_param);
+/**
+ * struct dma_async_tx_descriptor - async transaction descriptor
+ * ---dma generic offload fields---
+ * @cookie: tracking cookie for this transaction, set to -EBUSY if
+ *	this tx is sitting on a dependency list
+ * @ack: the descriptor can not be reused until the client acknowledges
+ *	receipt, i.e. has has a chance to establish any dependency chains
+ * @phys: physical address of the descriptor
+ * @tx_list: driver common field for operations that require multiple
+ *	descriptors
+ * @chan: target channel for this operation
+ * @tx_submit: set the prepared descriptor(s) to be executed by the engine
+ * @tx_set_dest: set a destination address in a hardware descriptor
+ * @tx_set_src: set a source address in a hardware descriptor
+ * @callback: routine to call after this operation is complete
+ * @callback_param: general parameter to pass to the callback routine
+ * ---async_tx api specific fields---
+ * @depend_list: at completion this list of transactions are submitted
+ * @depend_node: allow this transaction to be executed after another
+ *	transaction has completed, possibly on another channel
+ * @parent: pointer to the next level up in the dependency chain
+ * @lock: protect the dependency list
+ */
+struct dma_async_tx_descriptor {
+	dma_cookie_t cookie;
+	int ack;
+	dma_addr_t phys;
+	struct list_head tx_list;
+	struct dma_chan *chan;
+	dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
+	void (*tx_set_dest)(dma_addr_t addr,
+		struct dma_async_tx_descriptor *tx, int index);
+	void (*tx_set_src)(dma_addr_t addr,
+		struct dma_async_tx_descriptor *tx, int index);
+	dma_async_tx_callback callback;
+	void *callback_param;
+	struct list_head depend_list;
+	struct list_head depend_node;
+	struct dma_async_tx_descriptor *parent;
+	spinlock_t lock;
+};
+
 /**
  * struct dma_device - info on the entity supplying DMA services
  * @chancnt: how many DMA channels are supported
  * @channels: the list of struct dma_chan
  * @global_node: list_head for global dma_device_list
+ * @cap_mask: one or more dma_capability flags
+ * @max_xor: maximum number of xor sources, 0 if no capability
  * @refcount: reference count
  * @done: IO completion struct
  * @dev_id: unique device ID
+ * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
  *	number of allocated descriptors
  * @device_free_chan_resources: release DMA channel's resources
- * @device_memcpy_buf_to_buf: memcpy buf pointer to buf pointer
- * @device_memcpy_buf_to_pg: memcpy buf pointer to struct page
- * @device_memcpy_pg_to_pg: memcpy struct page/offset to struct page/offset
- * @device_memcpy_complete: poll the status of an IOAT DMA transaction
- * @device_memcpy_issue_pending: push appended descriptors to hardware
+ * @device_prep_dma_memcpy: prepares a memcpy operation
+ * @device_prep_dma_xor: prepares a xor operation
+ * @device_prep_dma_zero_sum: prepares a zero_sum operation
+ * @device_prep_dma_memset: prepares a memset operation
+ * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
+ * @device_dependency_added: async_tx notifies the channel about new deps
+ * @device_issue_pending: push pending transactions to hardware
  */
 struct dma_device {
 
 	unsigned int chancnt;
 	struct list_head channels;
 	struct list_head global_node;
+	dma_cap_mask_t  cap_mask;
+	int max_xor;
 
 	struct kref refcount;
 	struct completion done;
 
 	int dev_id;
+	struct device *dev;
 
 	int (*device_alloc_chan_resources)(struct dma_chan *chan);
 	void (*device_free_chan_resources)(struct dma_chan *chan);
-	dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
-			void *dest, void *src, size_t len);
-	dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
-			struct page *page, unsigned int offset, void *kdata,
-			size_t len);
-	dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
-			struct page *dest_pg, unsigned int dest_off,
-			struct page *src_pg, unsigned int src_off, size_t len);
-	enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+
+	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
+		struct dma_chan *chan, size_t len, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
+		struct dma_chan *chan, unsigned int src_cnt, size_t len,
+		int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
+		struct dma_chan *chan, unsigned int src_cnt, size_t len,
+		u32 *result, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
+		struct dma_chan *chan, int value, size_t len, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
+		struct dma_chan *chan);
+
+	void (*device_dependency_added)(struct dma_chan *chan);
+	enum dma_status (*device_is_tx_complete)(struct dma_chan *chan,
 			dma_cookie_t cookie, dma_cookie_t *last,
 			dma_cookie_t *used);
-	void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+	void (*device_issue_pending)(struct dma_chan *chan);
 };
 
 /* --- public DMA engine API --- */
@@ -207,96 +289,72 @@ struct dma_client *dma_async_client_register(dma_event_callback event_callback);
 void dma_async_client_unregister(struct dma_client *client);
 void dma_async_client_chan_request(struct dma_client *client,
 		unsigned int number);
+dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
+	void *dest, void *src, size_t len);
+dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
+	struct page *page, unsigned int offset, void *kdata, size_t len);
+dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
+	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
+	unsigned int src_off, size_t len);
+void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
+	struct dma_chan *chan);
 
-/**
- * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
- * @chan: DMA channel to offload copy to
- * @dest: destination address (virtual)
- * @src: source address (virtual)
- * @len: length
- *
- * Both @dest and @src must be mappable to a bus address according to the
- * DMA mapping API rules for streaming mappings.
- * Both @dest and @src must stay memory resident (kernel memory or locked
- * user space pages).
- */
-static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
-	void *dest, void *src, size_t len)
-{
-	int cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
 
-	return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len);
+static inline void
+async_tx_ack(struct dma_async_tx_descriptor *tx)
+{
+	tx->ack = 1;
 }
 
-/**
- * dma_async_memcpy_buf_to_pg - offloaded copy from address to page
- * @chan: DMA channel to offload copy to
- * @page: destination page
- * @offset: offset in page to copy to
- * @kdata: source address (virtual)
- * @len: length
- *
- * Both @page/@offset and @kdata must be mappable to a bus address according
- * to the DMA mapping API rules for streaming mappings.
- * Both @page/@offset and @kdata must stay memory resident (kernel memory or
- * locked user space pages)
- */
-static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
-	struct page *page, unsigned int offset, void *kdata, size_t len)
+#define first_dma_cap(mask) __first_dma_cap(&(mask))
+static inline int __first_dma_cap(const dma_cap_mask_t *srcp)
 {
-	int cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
+	return min_t(int, DMA_TX_TYPE_END,
+		find_first_bit(srcp->bits, DMA_TX_TYPE_END));
+}
 
-	return chan->device->device_memcpy_buf_to_pg(chan, page, offset,
-	                                             kdata, len);
+#define next_dma_cap(n, mask) __next_dma_cap((n), &(mask))
+static inline int __next_dma_cap(int n, const dma_cap_mask_t *srcp)
+{
+	return min_t(int, DMA_TX_TYPE_END,
+		find_next_bit(srcp->bits, DMA_TX_TYPE_END, n+1));
 }
 
-/**
- * dma_async_memcpy_pg_to_pg - offloaded copy from page to page
- * @chan: DMA channel to offload copy to
- * @dest_pg: destination page
- * @dest_off: offset in page to copy to
- * @src_pg: source page
- * @src_off: offset in page to copy from
- * @len: length
- *
- * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
- * address according to the DMA mapping API rules for streaming mappings.
- * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
- * (kernel memory or locked user space pages).
- */
-static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
-	struct page *dest_pg, unsigned int dest_off, struct page *src_pg,
-	unsigned int src_off, size_t len)
+#define dma_cap_set(tx, mask) __dma_cap_set((tx), &(mask))
+static inline void
+__dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp)
 {
-	int cpu = get_cpu();
-	per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
-	per_cpu_ptr(chan->local, cpu)->memcpy_count++;
-	put_cpu();
+	set_bit(tx_type, dstp->bits);
+}
 
-	return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off,
-	                                            src_pg, src_off, len);
+#define dma_has_cap(tx, mask) __dma_has_cap((tx), &(mask))
+static inline int
+__dma_has_cap(enum dma_transaction_type tx_type, dma_cap_mask_t *srcp)
+{
+	return test_bit(tx_type, srcp->bits);
 }
 
+#define for_each_dma_cap_mask(cap, mask) \
+	for ((cap) = first_dma_cap(mask);	\
+		(cap) < DMA_TX_TYPE_END;	\
+		(cap) = next_dma_cap((cap), (mask)))
+
 /**
- * dma_async_memcpy_issue_pending - flush pending copies to HW
+ * dma_async_issue_pending - flush pending transactions to HW
  * @chan: target DMA channel
  *
  * This allows drivers to push copies to HW in batches,
  * reducing MMIO writes where possible.
  */
-static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
+static inline void dma_async_issue_pending(struct dma_chan *chan)
 {
-	return chan->device->device_memcpy_issue_pending(chan);
+	return chan->device->device_issue_pending(chan);
 }
 
+#define dma_async_memcpy_issue_pending(chan) dma_async_issue_pending(chan)
+
 /**
- * dma_async_memcpy_complete - poll for transaction completion
+ * dma_async_is_tx_complete - poll for transaction completion
  * @chan: DMA channel
  * @cookie: transaction identifier to check status of
  * @last: returns last completed cookie, can be NULL
@@ -306,12 +364,15 @@ static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan)
  * internal state and can be used with dma_async_is_complete() to check
  * the status of multiple cookies without re-checking hardware state.
  */
-static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan,
+static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan,
 	dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used)
 {
-	return chan->device->device_memcpy_complete(chan, cookie, last, used);
+	return chan->device->device_is_tx_complete(chan, cookie, last, used);
 }
 
+#define dma_async_memcpy_complete(chan, cookie, last, used)\
+	dma_async_is_tx_complete(chan, cookie, last, used)
+
 /**
  * dma_async_is_complete - test a cookie against chan state
  * @cookie: transaction identifier to test status of
@@ -334,6 +395,7 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
 	return DMA_IN_PROGRESS;
 }
 
+enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie);
 
 /* --- DMA device --- */
 
@@ -362,5 +424,4 @@ dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
 	struct dma_pinned_list *pinned_list, struct page *page,
 	unsigned int offset, size_t len);
 
-#endif /* CONFIG_DMA_ENGINE */
 #endif /* DMAENGINE_H */
-- 
cgit v1.3-8-gc7d7


From d379b01e9087a582d58f4b678208a4f8d8376fe7 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Jul 2007 11:56:42 -0700
Subject: dmaengine: make clients responsible for managing channels

The current implementation assumes that a channel will only be used by one
client at a time.  In order to enable channel sharing the dmaengine core is
changed to a model where clients subscribe to channel-available-events.
Instead of tracking how many channels a client wants and how many it has
received the core just broadcasts the available channels and lets the
clients optionally take a reference.  The core learns about the clients'
needs at dma_event_callback time.

In support of multiple operation types, clients can specify a capability
mask to only be notified of channels that satisfy a certain set of
capabilities.

Changelog:
* removed DMA_TX_ARRAY_INIT, no longer needed
* dma_client_chan_free -> dma_chan_release: switch to global reference
  counting only at device unregistration time, before it was also happening
  at client unregistration time
* clients now return dma_state_client to dmaengine (ack, dup, nak)
* checkpatch.pl fixes
* fixup merge with git-ioat

Cc: Chris Leech <christopher.leech@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 drivers/dma/dmaengine.c   | 217 ++++++++++++++++++++++++----------------------
 drivers/dma/ioatdma.c     |   1 -
 drivers/dma/ioatdma.h     |   3 -
 include/linux/dmaengine.h |  58 ++++++++-----
 net/core/dev.c            | 112 ++++++++++++++++--------
 5 files changed, 224 insertions(+), 167 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 404cc7b6e705..82489923af09 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -37,11 +37,11 @@
  * Each device has a channels list, which runs unlocked but is never modified
  * once the device is registered, it's just setup by the driver.
  *
- * Each client has a channels list, it's only modified under the client->lock
- * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ * Each client is responsible for keeping track of the channels it uses.  See
+ * the definition of dma_event_callback in dmaengine.h.
  *
  * Each device has a kref, which is initialized to 1 when the device is
- * registered. A kref_put is done for each class_device registered.  When the
+ * registered. A kref_get is done for each class_device registered.  When the
  * class_device is released, the coresponding kref_put is done in the release
  * method. Every time one of the device's channels is allocated to a client,
  * a kref_get occurs.  When the channel is freed, the coresponding kref_put
@@ -51,10 +51,12 @@
  * references to finish.
  *
  * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
- * with a kref and a per_cpu local_t.  A single reference is set when on an
- * ADDED event, and removed with a REMOVE event.  Net DMA client takes an
- * extra reference per outstanding transaction.  The relase function does a
- * kref_put on the device. -ChrisL
+ * with a kref and a per_cpu local_t.  A dma_chan_get is called when a client
+ * signals that it wants to use a channel, and dma_chan_put is called when
+ * a channel is removed or a client using it is unregesitered.  A client can
+ * take extra references per outstanding transaction, as is the case with
+ * the NET DMA client.  The release function does a kref_put on the device.
+ *	-ChrisL, DanW
  */
 
 #include <linux/init.h>
@@ -102,8 +104,19 @@ static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
 static ssize_t show_in_use(struct class_device *cd, char *buf)
 {
 	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	int in_use = 0;
+
+	if (unlikely(chan->slow_ref) &&
+		atomic_read(&chan->refcount.refcount) > 1)
+		in_use = 1;
+	else {
+		if (local_read(&(per_cpu_ptr(chan->local,
+			get_cpu())->refcount)) > 0)
+			in_use = 1;
+		put_cpu();
+	}
 
-	return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+	return sprintf(buf, "%d\n", in_use);
 }
 
 static struct class_device_attribute dma_class_attrs[] = {
@@ -129,42 +142,53 @@ static struct class dma_devclass = {
 
 /* --- client and device registration --- */
 
+#define dma_chan_satisfies_mask(chan, mask) \
+	__dma_chan_satisfies_mask((chan), &(mask))
+static int
+__dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want)
+{
+	dma_cap_mask_t has;
+
+	bitmap_and(has.bits, want->bits, chan->device->cap_mask.bits,
+		DMA_TX_TYPE_END);
+	return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END);
+}
+
 /**
- * dma_client_chan_alloc - try to allocate a channel to a client
+ * dma_client_chan_alloc - try to allocate channels to a client
  * @client: &dma_client
  *
  * Called with dma_list_mutex held.
  */
-static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+static void dma_client_chan_alloc(struct dma_client *client)
 {
 	struct dma_device *device;
 	struct dma_chan *chan;
-	unsigned long flags;
 	int desc;	/* allocated descriptor count */
+	enum dma_state_client ack;
 
-	/* Find a channel, any DMA engine will do */
-	list_for_each_entry(device, &dma_device_list, global_node) {
+	/* Find a channel */
+	list_for_each_entry(device, &dma_device_list, global_node)
 		list_for_each_entry(chan, &device->channels, device_node) {
-			if (chan->client)
+			if (!dma_chan_satisfies_mask(chan, client->cap_mask))
 				continue;
 
 			desc = chan->device->device_alloc_chan_resources(chan);
 			if (desc >= 0) {
-				kref_get(&device->refcount);
-				kref_init(&chan->refcount);
-				chan->slow_ref = 0;
-				INIT_RCU_HEAD(&chan->rcu);
-				chan->client = client;
-				spin_lock_irqsave(&client->lock, flags);
-				list_add_tail_rcu(&chan->client_node,
-				                  &client->channels);
-				spin_unlock_irqrestore(&client->lock, flags);
-				return chan;
+				ack = client->event_callback(client,
+						chan,
+						DMA_RESOURCE_AVAILABLE);
+
+				/* we are done once this client rejects
+				 * an available resource
+				 */
+				if (ack == DMA_ACK) {
+					dma_chan_get(chan);
+					kref_get(&device->refcount);
+				} else if (ack == DMA_NAK)
+					return;
 			}
 		}
-	}
-
-	return NULL;
 }
 
 enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie)
@@ -193,7 +217,6 @@ void dma_chan_cleanup(struct kref *kref)
 {
 	struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
 	chan->device->device_free_chan_resources(chan);
-	chan->client = NULL;
 	kref_put(&chan->device->refcount, dma_async_device_cleanup);
 }
 EXPORT_SYMBOL(dma_chan_cleanup);
@@ -209,7 +232,7 @@ static void dma_chan_free_rcu(struct rcu_head *rcu)
 	kref_put(&chan->refcount, dma_chan_cleanup);
 }
 
-static void dma_client_chan_free(struct dma_chan *chan)
+static void dma_chan_release(struct dma_chan *chan)
 {
 	atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
 	chan->slow_ref = 1;
@@ -217,70 +240,57 @@ static void dma_client_chan_free(struct dma_chan *chan)
 }
 
 /**
- * dma_chans_rebalance - reallocate channels to clients
- *
- * When the number of DMA channel in the system changes,
- * channels need to be rebalanced among clients.
+ * dma_chans_notify_available - broadcast available channels to the clients
  */
-static void dma_chans_rebalance(void)
+static void dma_clients_notify_available(void)
 {
 	struct dma_client *client;
-	struct dma_chan *chan;
-	unsigned long flags;
 
 	mutex_lock(&dma_list_mutex);
 
-	list_for_each_entry(client, &dma_client_list, global_node) {
-		while (client->chans_desired > client->chan_count) {
-			chan = dma_client_chan_alloc(client);
-			if (!chan)
-				break;
-			client->chan_count++;
-			client->event_callback(client,
-	                                       chan,
-	                                       DMA_RESOURCE_ADDED);
-		}
-		while (client->chans_desired < client->chan_count) {
-			spin_lock_irqsave(&client->lock, flags);
-			chan = list_entry(client->channels.next,
-			                  struct dma_chan,
-			                  client_node);
-			list_del_rcu(&chan->client_node);
-			spin_unlock_irqrestore(&client->lock, flags);
-			client->chan_count--;
-			client->event_callback(client,
-			                       chan,
-			                       DMA_RESOURCE_REMOVED);
-			dma_client_chan_free(chan);
-		}
-	}
+	list_for_each_entry(client, &dma_client_list, global_node)
+		dma_client_chan_alloc(client);
 
 	mutex_unlock(&dma_list_mutex);
 }
 
 /**
- * dma_async_client_register - allocate and register a &dma_client
- * @event_callback: callback for notification of channel addition/removal
+ * dma_chans_notify_available - tell the clients that a channel is going away
+ * @chan: channel on its way out
  */
-struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+static void dma_clients_notify_removed(struct dma_chan *chan)
 {
 	struct dma_client *client;
+	enum dma_state_client ack;
 
-	client = kzalloc(sizeof(*client), GFP_KERNEL);
-	if (!client)
-		return NULL;
+	mutex_lock(&dma_list_mutex);
+
+	list_for_each_entry(client, &dma_client_list, global_node) {
+		ack = client->event_callback(client, chan,
+				DMA_RESOURCE_REMOVED);
+
+		/* client was holding resources for this channel so
+		 * free it
+		 */
+		if (ack == DMA_ACK) {
+			dma_chan_put(chan);
+			kref_put(&chan->device->refcount,
+				dma_async_device_cleanup);
+		}
+	}
 
-	INIT_LIST_HEAD(&client->channels);
-	spin_lock_init(&client->lock);
-	client->chans_desired = 0;
-	client->chan_count = 0;
-	client->event_callback = event_callback;
+	mutex_unlock(&dma_list_mutex);
+}
 
+/**
+ * dma_async_client_register - register a &dma_client
+ * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask'
+ */
+void dma_async_client_register(struct dma_client *client)
+{
 	mutex_lock(&dma_list_mutex);
 	list_add_tail(&client->global_node, &dma_client_list);
 	mutex_unlock(&dma_list_mutex);
-
-	return client;
 }
 EXPORT_SYMBOL(dma_async_client_register);
 
@@ -292,40 +302,42 @@ EXPORT_SYMBOL(dma_async_client_register);
  */
 void dma_async_client_unregister(struct dma_client *client)
 {
+	struct dma_device *device;
 	struct dma_chan *chan;
+	enum dma_state_client ack;
 
 	if (!client)
 		return;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(chan, &client->channels, client_node)
-		dma_client_chan_free(chan);
-	rcu_read_unlock();
-
 	mutex_lock(&dma_list_mutex);
+	/* free all channels the client is holding */
+	list_for_each_entry(device, &dma_device_list, global_node)
+		list_for_each_entry(chan, &device->channels, device_node) {
+			ack = client->event_callback(client, chan,
+				DMA_RESOURCE_REMOVED);
+
+			if (ack == DMA_ACK) {
+				dma_chan_put(chan);
+				kref_put(&chan->device->refcount,
+					dma_async_device_cleanup);
+			}
+		}
+
 	list_del(&client->global_node);
 	mutex_unlock(&dma_list_mutex);
-
-	kfree(client);
-	dma_chans_rebalance();
 }
 EXPORT_SYMBOL(dma_async_client_unregister);
 
 /**
- * dma_async_client_chan_request - request DMA channels
- * @client: &dma_client
- * @number: count of DMA channels requested
- *
- * Clients call dma_async_client_chan_request() to specify how many
- * DMA channels they need, 0 to free all currently allocated.
- * The resulting allocations/frees are indicated to the client via the
- * event callback.
+ * dma_async_client_chan_request - send all available channels to the
+ * client that satisfy the capability mask
+ * @client - requester
  */
-void dma_async_client_chan_request(struct dma_client *client,
-			unsigned int number)
+void dma_async_client_chan_request(struct dma_client *client)
 {
-	client->chans_desired = number;
-	dma_chans_rebalance();
+	mutex_lock(&dma_list_mutex);
+	dma_client_chan_alloc(client);
+	mutex_unlock(&dma_list_mutex);
 }
 EXPORT_SYMBOL(dma_async_client_chan_request);
 
@@ -386,13 +398,16 @@ int dma_async_device_register(struct dma_device *device)
 		}
 
 		kref_get(&device->refcount);
+		kref_init(&chan->refcount);
+		chan->slow_ref = 0;
+		INIT_RCU_HEAD(&chan->rcu);
 	}
 
 	mutex_lock(&dma_list_mutex);
 	list_add_tail(&device->global_node, &dma_device_list);
 	mutex_unlock(&dma_list_mutex);
 
-	dma_chans_rebalance();
+	dma_clients_notify_available();
 
 	return 0;
 
@@ -428,26 +443,16 @@ static void dma_async_device_cleanup(struct kref *kref)
 void dma_async_device_unregister(struct dma_device *device)
 {
 	struct dma_chan *chan;
-	unsigned long flags;
 
 	mutex_lock(&dma_list_mutex);
 	list_del(&device->global_node);
 	mutex_unlock(&dma_list_mutex);
 
 	list_for_each_entry(chan, &device->channels, device_node) {
-		if (chan->client) {
-			spin_lock_irqsave(&chan->client->lock, flags);
-			list_del(&chan->client_node);
-			chan->client->chan_count--;
-			spin_unlock_irqrestore(&chan->client->lock, flags);
-			chan->client->event_callback(chan->client,
-			                             chan,
-			                             DMA_RESOURCE_REMOVED);
-			dma_client_chan_free(chan);
-		}
+		dma_clients_notify_removed(chan);
 		class_device_unregister(&chan->class_dev);
+		dma_chan_release(chan);
 	}
-	dma_chans_rebalance();
 
 	kref_put(&device->refcount, dma_async_device_cleanup);
 	wait_for_completion(&device->done);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 171044930282..81810b3042f1 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -72,7 +72,6 @@ static int enumerate_dma_channels(struct ioat_device *device)
 		INIT_LIST_HEAD(&ioat_chan->used_desc);
 		/* This should be made common somewhere in dmaengine.c */
 		ioat_chan->common.device = &device->common;
-		ioat_chan->common.client = NULL;
 		list_add_tail(&ioat_chan->common.device_node,
 		              &device->common.channels);
 	}
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
index d3f69bb15fa0..d3726478031a 100644
--- a/drivers/dma/ioatdma.h
+++ b/drivers/dma/ioatdma.h
@@ -30,9 +30,6 @@
 
 #define IOAT_LOW_COMPLETION_MASK	0xffffffc0
 
-extern struct list_head dma_device_list;
-extern struct list_head dma_client_list;
-
 /**
  * struct ioat_device - internal representation of a IOAT device
  * @pdev: PCI-Express device
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3de1cf71031a..a3b6035b6c86 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -29,19 +29,31 @@
 #include <linux/dma-mapping.h>
 
 /**
- * enum dma_event - resource PNP/power managment events
+ * enum dma_state - resource PNP/power managment state
  * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
  * @DMA_RESOURCE_RESUME: DMA device returning to full power
- * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_AVAILABLE: DMA device available to the system
  * @DMA_RESOURCE_REMOVED: DMA device removed from the system
  */
-enum dma_event {
+enum dma_state {
 	DMA_RESOURCE_SUSPEND,
 	DMA_RESOURCE_RESUME,
-	DMA_RESOURCE_ADDED,
+	DMA_RESOURCE_AVAILABLE,
 	DMA_RESOURCE_REMOVED,
 };
 
+/**
+ * enum dma_state_client - state of the channel in the client
+ * @DMA_ACK: client would like to use, or was using this channel
+ * @DMA_DUP: client has already seen this channel, or is not using this channel
+ * @DMA_NAK: client does not want to see any more channels
+ */
+enum dma_state_client {
+	DMA_ACK,
+	DMA_DUP,
+	DMA_NAK,
+};
+
 /**
  * typedef dma_cookie_t - an opaque DMA cookie
  *
@@ -104,7 +116,6 @@ struct dma_chan_percpu {
 
 /**
  * struct dma_chan - devices supply DMA channels, clients use them
- * @client: ptr to the client user of this chan, will be %NULL when unused
  * @device: ptr to the dma device who supplies this channel, always !%NULL
  * @cookie: last cookie value returned to client
  * @chan_id: channel ID for sysfs
@@ -112,12 +123,10 @@ struct dma_chan_percpu {
  * @refcount: kref, used in "bigref" slow-mode
  * @slow_ref: indicates that the DMA channel is free
  * @rcu: the DMA channel's RCU head
- * @client_node: used to add this to the client chan list
  * @device_node: used to add this to the device chan list
  * @local: per-cpu pointer to a struct dma_chan_percpu
  */
 struct dma_chan {
-	struct dma_client *client;
 	struct dma_device *device;
 	dma_cookie_t cookie;
 
@@ -129,11 +138,11 @@ struct dma_chan {
 	int slow_ref;
 	struct rcu_head rcu;
 
-	struct list_head client_node;
 	struct list_head device_node;
 	struct dma_chan_percpu *local;
 };
 
+
 void dma_chan_cleanup(struct kref *kref);
 
 static inline void dma_chan_get(struct dma_chan *chan)
@@ -158,26 +167,31 @@ static inline void dma_chan_put(struct dma_chan *chan)
 
 /*
  * typedef dma_event_callback - function pointer to a DMA event callback
+ * For each channel added to the system this routine is called for each client.
+ * If the client would like to use the channel it returns '1' to signal (ack)
+ * the dmaengine core to take out a reference on the channel and its
+ * corresponding device.  A client must not 'ack' an available channel more
+ * than once.  When a channel is removed all clients are notified.  If a client
+ * is using the channel it must 'ack' the removal.  A client must not 'ack' a
+ * removed channel more than once.
+ * @client - 'this' pointer for the client context
+ * @chan - channel to be acted upon
+ * @state - available or removed
  */
-typedef void (*dma_event_callback) (struct dma_client *client,
-		struct dma_chan *chan, enum dma_event event);
+struct dma_client;
+typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client,
+		struct dma_chan *chan, enum dma_state state);
 
 /**
  * struct dma_client - info on the entity making use of DMA services
  * @event_callback: func ptr to call when something happens
- * @chan_count: number of chans allocated
- * @chans_desired: number of chans requested. Can be +/- chan_count
- * @lock: protects access to the channels list
- * @channels: the list of DMA channels allocated
+ * @cap_mask: only return channels that satisfy the requested capabilities
+ *  a value of zero corresponds to any capability
  * @global_node: list_head for global dma_client_list
  */
 struct dma_client {
 	dma_event_callback	event_callback;
-	unsigned int		chan_count;
-	unsigned int		chans_desired;
-
-	spinlock_t		lock;
-	struct list_head	channels;
+	dma_cap_mask_t		cap_mask;
 	struct list_head	global_node;
 };
 
@@ -285,10 +299,9 @@ struct dma_device {
 
 /* --- public DMA engine API --- */
 
-struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_register(struct dma_client *client);
 void dma_async_client_unregister(struct dma_client *client);
-void dma_async_client_chan_request(struct dma_client *client,
-		unsigned int number);
+void dma_async_client_chan_request(struct dma_client *client);
 dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
 	void *dest, void *src, size_t len);
 dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
@@ -299,7 +312,6 @@ dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan,
 void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 	struct dma_chan *chan);
 
-
 static inline void
 async_tx_ack(struct dma_async_tx_descriptor *tx)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index ee051bb398a0..835202fb34c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -151,9 +151,22 @@ static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
 static struct list_head ptype_all __read_mostly;	/* Taps */
 
 #ifdef CONFIG_NET_DMA
-static struct dma_client *net_dma_client;
-static unsigned int net_dma_count;
-static spinlock_t net_dma_event_lock;
+struct net_dma {
+	struct dma_client client;
+	spinlock_t lock;
+	cpumask_t channel_mask;
+	struct dma_chan *channels[NR_CPUS];
+};
+
+static enum dma_state_client
+netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_state state);
+
+static struct net_dma net_dma = {
+	.client = {
+		.event_callback = netdev_dma_event,
+	},
+};
 #endif
 
 /*
@@ -2015,12 +2028,13 @@ out:
 	 * There may not be any more sk_buffs coming right now, so push
 	 * any pending DMA copies to hardware
 	 */
-	if (net_dma_client) {
-		struct dma_chan *chan;
-		rcu_read_lock();
-		list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
-			dma_async_memcpy_issue_pending(chan);
-		rcu_read_unlock();
+	if (!cpus_empty(net_dma.channel_mask)) {
+		int chan_idx;
+		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
+			struct dma_chan *chan = net_dma.channels[chan_idx];
+			if (chan)
+				dma_async_memcpy_issue_pending(chan);
+		}
 	}
 #endif
 	return;
@@ -3563,12 +3577,13 @@ static int dev_cpu_callback(struct notifier_block *nfb,
  * This is called when the number of channels allocated to the net_dma_client
  * changes.  The net_dma_client tries to have one DMA channel per CPU.
  */
-static void net_dma_rebalance(void)
+
+static void net_dma_rebalance(struct net_dma *net_dma)
 {
-	unsigned int cpu, i, n;
+	unsigned int cpu, i, n, chan_idx;
 	struct dma_chan *chan;
 
-	if (net_dma_count == 0) {
+	if (cpus_empty(net_dma->channel_mask)) {
 		for_each_online_cpu(cpu)
 			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
 		return;
@@ -3577,10 +3592,12 @@ static void net_dma_rebalance(void)
 	i = 0;
 	cpu = first_cpu(cpu_online_map);
 
-	rcu_read_lock();
-	list_for_each_entry(chan, &net_dma_client->channels, client_node) {
-		n = ((num_online_cpus() / net_dma_count)
-		   + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
+	for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
+		chan = net_dma->channels[chan_idx];
+
+		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
+		   + (i < (num_online_cpus() %
+			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
 
 		while(n) {
 			per_cpu(softnet_data, cpu).net_dma = chan;
@@ -3589,7 +3606,6 @@ static void net_dma_rebalance(void)
 		}
 		i++;
 	}
-	rcu_read_unlock();
 }
 
 /**
@@ -3598,23 +3614,53 @@ static void net_dma_rebalance(void)
  * @chan: DMA channel for the event
  * @event: event type
  */
-static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
-	enum dma_event event)
-{
-	spin_lock(&net_dma_event_lock);
-	switch (event) {
-	case DMA_RESOURCE_ADDED:
-		net_dma_count++;
-		net_dma_rebalance();
+static enum dma_state_client
+netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
+	enum dma_state state)
+{
+	int i, found = 0, pos = -1;
+	struct net_dma *net_dma =
+		container_of(client, struct net_dma, client);
+	enum dma_state_client ack = DMA_DUP; /* default: take no action */
+
+	spin_lock(&net_dma->lock);
+	switch (state) {
+	case DMA_RESOURCE_AVAILABLE:
+		for (i = 0; i < NR_CPUS; i++)
+			if (net_dma->channels[i] == chan) {
+				found = 1;
+				break;
+			} else if (net_dma->channels[i] == NULL && pos < 0)
+				pos = i;
+
+		if (!found && pos >= 0) {
+			ack = DMA_ACK;
+			net_dma->channels[pos] = chan;
+			cpu_set(pos, net_dma->channel_mask);
+			net_dma_rebalance(net_dma);
+		}
 		break;
 	case DMA_RESOURCE_REMOVED:
-		net_dma_count--;
-		net_dma_rebalance();
+		for (i = 0; i < NR_CPUS; i++)
+			if (net_dma->channels[i] == chan) {
+				found = 1;
+				pos = i;
+				break;
+			}
+
+		if (found) {
+			ack = DMA_ACK;
+			cpu_clear(pos, net_dma->channel_mask);
+			net_dma->channels[i] = NULL;
+			net_dma_rebalance(net_dma);
+		}
 		break;
 	default:
 		break;
 	}
-	spin_unlock(&net_dma_event_lock);
+	spin_unlock(&net_dma->lock);
+
+	return ack;
 }
 
 /**
@@ -3622,12 +3668,10 @@ static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
  */
 static int __init netdev_dma_register(void)
 {
-	spin_lock_init(&net_dma_event_lock);
-	net_dma_client = dma_async_client_register(netdev_dma_event);
-	if (net_dma_client == NULL)
-		return -ENOMEM;
-
-	dma_async_client_chan_request(net_dma_client, num_online_cpus());
+	spin_lock_init(&net_dma.lock);
+	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
+	dma_async_client_register(&net_dma.client);
+	dma_async_client_chan_request(&net_dma.client);
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From 685784aaf3cd0e3ff5e36c7ecf6f441cdbf57f73 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Jul 2007 11:56:42 -0700
Subject: xor: make 'xor_blocks' a library routine for use with async_tx

The async_tx api tries to use a dma engine for an operation, but will fall
back to an optimized software routine otherwise.  Xor support is
implemented using the raid5 xor routines.  For organizational purposes this
routine is moved to a common area.

The following fixes are also made:
* rename xor_block => xor_blocks, suggested by Adrian Bunk
* ensure that xor.o initializes before md.o in the built-in case
* checkpatch.pl fixes
* mark calibrate_xor_blocks __init, Adrian Bunk

Cc: Adrian Bunk <bunk@stusta.de>
Cc: NeilBrown <neilb@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 crypto/Kconfig           |   6 ++
 crypto/Makefile          |   6 ++
 crypto/xor.c             | 156 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/Kconfig       |   1 +
 drivers/md/Makefile      |   4 +-
 drivers/md/md.c          |   2 +-
 drivers/md/raid5.c       |  10 +--
 drivers/md/xor.c         | 154 ----------------------------------------------
 include/linux/raid/xor.h |   2 +-
 9 files changed, 178 insertions(+), 163 deletions(-)
 create mode 100644 crypto/xor.c
 delete mode 100644 drivers/md/xor.c

(limited to 'include/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 4ca0ab3448d9..b749a1a46e22 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1,3 +1,9 @@
+#
+# Generic algorithms support
+#
+config XOR_BLOCKS
+	tristate
+
 #
 # Cryptographic API Configuration
 #
diff --git a/crypto/Makefile b/crypto/Makefile
index cce46a1c9dc7..68e934b4bee2 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -50,3 +50,9 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
 obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
 
 obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
+
+#
+# generic algorithms and the async_tx api
+#
+obj-$(CONFIG_XOR_BLOCKS) += xor.o
+
diff --git a/crypto/xor.c b/crypto/xor.c
new file mode 100644
index 000000000000..8281ac5e68a8
--- /dev/null
+++ b/crypto/xor.c
@@ -0,0 +1,156 @@
+/*
+ * xor.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000,
+ * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
+ *
+ * Dispatch optimized RAID-5 checksumming functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define BH_TRACE 0
+#include <linux/module.h>
+#include <linux/raid/md.h>
+#include <linux/raid/xor.h>
+#include <asm/xor.h>
+
+/* The xor routines to use.  */
+static struct xor_block_template *active_template;
+
+void
+xor_blocks(unsigned int count, unsigned int bytes, void **ptr)
+{
+	unsigned long *p0, *p1, *p2, *p3, *p4;
+
+	p0 = (unsigned long *) ptr[0];
+	p1 = (unsigned long *) ptr[1];
+	if (count == 2) {
+		active_template->do_2(bytes, p0, p1);
+		return;
+	}
+
+	p2 = (unsigned long *) ptr[2];
+	if (count == 3) {
+		active_template->do_3(bytes, p0, p1, p2);
+		return;
+	}
+
+	p3 = (unsigned long *) ptr[3];
+	if (count == 4) {
+		active_template->do_4(bytes, p0, p1, p2, p3);
+		return;
+	}
+
+	p4 = (unsigned long *) ptr[4];
+	active_template->do_5(bytes, p0, p1, p2, p3, p4);
+}
+EXPORT_SYMBOL(xor_blocks);
+
+/* Set of all registered templates.  */
+static struct xor_block_template *template_list;
+
+#define BENCH_SIZE (PAGE_SIZE)
+
+static void
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
+{
+	int speed;
+	unsigned long now;
+	int i, count, max;
+
+	tmpl->next = template_list;
+	template_list = tmpl;
+
+	/*
+	 * Count the number of XORs done during a whole jiffy, and use
+	 * this to calculate the speed of checksumming.  We use a 2-page
+	 * allocation to have guaranteed color L1-cache layout.
+	 */
+	max = 0;
+	for (i = 0; i < 5; i++) {
+		now = jiffies;
+		count = 0;
+		while (jiffies == now) {
+			mb(); /* prevent loop optimzation */
+			tmpl->do_2(BENCH_SIZE, b1, b2);
+			mb();
+			count++;
+			mb();
+		}
+		if (count > max)
+			max = count;
+	}
+
+	speed = max * (HZ * BENCH_SIZE / 1024);
+	tmpl->speed = speed;
+
+	printk(KERN_INFO "   %-10s: %5d.%03d MB/sec\n", tmpl->name,
+	       speed / 1000, speed % 1000);
+}
+
+static int __init
+calibrate_xor_blocks(void)
+{
+	void *b1, *b2;
+	struct xor_block_template *f, *fastest;
+
+	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
+	if (!b1) {
+		printk(KERN_WARNING "xor: Yikes!  No memory available.\n");
+		return -ENOMEM;
+	}
+	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
+
+	/*
+	 * If this arch/cpu has a short-circuited selection, don't loop through
+	 * all the possible functions, just test the best one
+	 */
+
+	fastest = NULL;
+
+#ifdef XOR_SELECT_TEMPLATE
+		fastest = XOR_SELECT_TEMPLATE(fastest);
+#endif
+
+#define xor_speed(templ)	do_xor_speed((templ), b1, b2)
+
+	if (fastest) {
+		printk(KERN_INFO "xor: automatically using best "
+			"checksumming function: %s\n",
+			fastest->name);
+		xor_speed(fastest);
+	} else {
+		printk(KERN_INFO "xor: measuring checksumming speed\n");
+		XOR_TRY_TEMPLATES;
+		fastest = template_list;
+		for (f = fastest; f; f = f->next)
+			if (f->speed > fastest->speed)
+				fastest = f;
+	}
+
+	printk(KERN_INFO "xor: using function: %s (%d.%03d MB/sec)\n",
+	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);
+
+#undef xor_speed
+
+	free_pages((unsigned long)b1, 2);
+
+	active_template = fastest;
+	return 0;
+}
+
+static __exit void xor_exit(void) { }
+
+MODULE_LICENSE("GPL");
+
+/* when built-in xor.o must initialize before drivers/md/md.o */
+core_initcall(calibrate_xor_blocks);
+module_exit(xor_exit);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 7df934d69134..24d93d02a1f3 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -109,6 +109,7 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
+	select XOR_BLOCKS
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 38754084eac7..71eb45f74171 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -17,7 +17,7 @@ raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
 hostprogs-y	:= mktables
 
 # Note: link order is important.  All raid personalities
-# and xor.o must come before md.o, as they each initialise 
+# and must come before md.o, as they each initialise 
 # themselves, and md.o may use the personalities when it 
 # auto-initialised.
 
@@ -25,7 +25,7 @@ obj-$(CONFIG_MD_LINEAR)		+= linear.o
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID10)		+= raid10.o
-obj-$(CONFIG_MD_RAID456)	+= raid456.o xor.o
+obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1c54f3c1cca7..33beaa7da085 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5814,7 +5814,7 @@ static __exit void md_exit(void)
 	}
 }
 
-module_init(md_init)
+subsys_initcall(md_init);
 module_exit(md_exit)
 
 static int get_ro(char *buffer, struct kernel_param *kp)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 061375ee6592..5adbe0b22684 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -918,7 +918,7 @@ static void copy_data(int frombio, struct bio *bio,
 
 #define check_xor() 	do { 						\
 			   if (count == MAX_XOR_BLOCKS) {		\
-				xor_block(count, STRIPE_SIZE, ptr);	\
+				xor_blocks(count, STRIPE_SIZE, ptr);	\
 				count = 1;				\
 			   }						\
 			} while(0)
@@ -949,7 +949,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
 		check_xor();
 	}
 	if (count != 1)
-		xor_block(count, STRIPE_SIZE, ptr);
+		xor_blocks(count, STRIPE_SIZE, ptr);
 	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 }
 
@@ -1004,7 +1004,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
 		break;
 	}
 	if (count>1) {
-		xor_block(count, STRIPE_SIZE, ptr);
+		xor_blocks(count, STRIPE_SIZE, ptr);
 		count = 1;
 	}
 	
@@ -1038,7 +1038,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
 			}
 	}
 	if (count != 1)
-		xor_block(count, STRIPE_SIZE, ptr);
+		xor_blocks(count, STRIPE_SIZE, ptr);
 	
 	if (method != CHECK_PARITY) {
 		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
@@ -1160,7 +1160,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 			check_xor();
 		}
 		if (count != 1)
-			xor_block(count, STRIPE_SIZE, ptr);
+			xor_blocks(count, STRIPE_SIZE, ptr);
 		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 	}
diff --git a/drivers/md/xor.c b/drivers/md/xor.c
deleted file mode 100644
index 324897c4be4e..000000000000
--- a/drivers/md/xor.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * xor.c : Multiple Devices driver for Linux
- *
- * Copyright (C) 1996, 1997, 1998, 1999, 2000,
- * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
- *
- * Dispatch optimized RAID-5 checksumming functions.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define BH_TRACE 0
-#include <linux/module.h>
-#include <linux/raid/md.h>
-#include <linux/raid/xor.h>
-#include <asm/xor.h>
-
-/* The xor routines to use.  */
-static struct xor_block_template *active_template;
-
-void
-xor_block(unsigned int count, unsigned int bytes, void **ptr)
-{
-	unsigned long *p0, *p1, *p2, *p3, *p4;
-
-	p0 = (unsigned long *) ptr[0];
-	p1 = (unsigned long *) ptr[1];
-	if (count == 2) {
-		active_template->do_2(bytes, p0, p1);
-		return;
-	}
-
-	p2 = (unsigned long *) ptr[2];
-	if (count == 3) {
-		active_template->do_3(bytes, p0, p1, p2);
-		return;
-	}
-
-	p3 = (unsigned long *) ptr[3];
-	if (count == 4) {
-		active_template->do_4(bytes, p0, p1, p2, p3);
-		return;
-	}
-
-	p4 = (unsigned long *) ptr[4];
-	active_template->do_5(bytes, p0, p1, p2, p3, p4);
-}
-
-/* Set of all registered templates.  */
-static struct xor_block_template *template_list;
-
-#define BENCH_SIZE (PAGE_SIZE)
-
-static void
-do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
-{
-	int speed;
-	unsigned long now;
-	int i, count, max;
-
-	tmpl->next = template_list;
-	template_list = tmpl;
-
-	/*
-	 * Count the number of XORs done during a whole jiffy, and use
-	 * this to calculate the speed of checksumming.  We use a 2-page
-	 * allocation to have guaranteed color L1-cache layout.
-	 */
-	max = 0;
-	for (i = 0; i < 5; i++) {
-		now = jiffies;
-		count = 0;
-		while (jiffies == now) {
-			mb();
-			tmpl->do_2(BENCH_SIZE, b1, b2);
-			mb();
-			count++;
-			mb();
-		}
-		if (count > max)
-			max = count;
-	}
-
-	speed = max * (HZ * BENCH_SIZE / 1024);
-	tmpl->speed = speed;
-
-	printk("   %-10s: %5d.%03d MB/sec\n", tmpl->name,
-	       speed / 1000, speed % 1000);
-}
-
-static int
-calibrate_xor_block(void)
-{
-	void *b1, *b2;
-	struct xor_block_template *f, *fastest;
-
-	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
-	if (! b1) {
-		printk("raid5: Yikes!  No memory available.\n");
-		return -ENOMEM;
-	}
-	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
-
-	/*
-	 * If this arch/cpu has a short-circuited selection, don't loop through all
-	 * the possible functions, just test the best one
-	 */
-
-	fastest = NULL;
-
-#ifdef XOR_SELECT_TEMPLATE
-		fastest = XOR_SELECT_TEMPLATE(fastest);
-#endif
-
-#define xor_speed(templ)	do_xor_speed((templ), b1, b2)
-
-	if (fastest) {
-		printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n",
-			fastest->name);
-		xor_speed(fastest);
-	} else {
-		printk(KERN_INFO "raid5: measuring checksumming speed\n");
-		XOR_TRY_TEMPLATES;
-		fastest = template_list;
-		for (f = fastest; f; f = f->next)
-			if (f->speed > fastest->speed)
-				fastest = f;
-	}
-
-	printk("raid5: using function: %s (%d.%03d MB/sec)\n",
-	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);
-
-#undef xor_speed
-
-	free_pages((unsigned long)b1, 2);
-
-	active_template = fastest;
-	return 0;
-}
-
-static __exit void xor_exit(void) { }
-
-EXPORT_SYMBOL(xor_block);
-MODULE_LICENSE("GPL");
-
-module_init(calibrate_xor_block);
-module_exit(xor_exit);
diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
index f0d67cbdea40..7d6c20b654fa 100644
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -5,7 +5,7 @@
 
 #define MAX_XOR_BLOCKS 5
 
-extern void xor_block(unsigned int count, unsigned int bytes, void **ptr);
+extern void xor_blocks(unsigned int count, unsigned int bytes, void **ptr);
 
 struct xor_block_template {
         struct xor_block_template *next;
-- 
cgit v1.3-8-gc7d7


From 9bc89cd82d6f88fb0ca39b30445c329a430fd66b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 11:10:44 -0700
Subject: async_tx: add the async_tx api

The async_tx api provides methods for describing a chain of asynchronous
bulk memory transfers/transforms with support for inter-transactional
dependencies.  It is implemented as a dmaengine client that smooths over
the details of different hardware offload engine implementations.  Code
that is written to the api can optimize for asynchronous operation and the
api will fit the chain of operations to the available offload resources.

	I imagine that any piece of ADMA hardware would register with the
	'async_*' subsystem, and a call to async_X would be routed as
	appropriate, or be run in-line. - Neil Brown

async_tx exploits the capabilities of struct dma_async_tx_descriptor to
provide an api of the following general format:

struct dma_async_tx_descriptor *
async_<operation>(..., struct dma_async_tx_descriptor *depend_tx,
			dma_async_tx_callback cb_fn, void *cb_param)
{
	struct dma_chan *chan = async_tx_find_channel(depend_tx, <operation>);
	struct dma_device *device = chan ? chan->device : NULL;
	int int_en = cb_fn ? 1 : 0;
	struct dma_async_tx_descriptor *tx = device ?
		device->device_prep_dma_<operation>(chan, len, int_en) : NULL;

	if (tx) { /* run <operation> asynchronously */
		...
		tx->tx_set_dest(addr, tx, index);
		...
		tx->tx_set_src(addr, tx, index);
		...
		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
	} else { /* run <operation> synchronously */
		...
		<operation>
		...
		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
	}

	return tx;
}

async_tx_find_channel() returns a capable channel from its pool.  The
channel pool is organized as a per-cpu array of channel pointers.  The
async_tx_rebalance() routine is tasked with managing these arrays.  In the
uniprocessor case async_tx_rebalance() tries to spread responsibility
evenly over channels of similar capabilities.  For example if there are two
copy+xor channels, one will handle copy operations and the other will
handle xor.  In the SMP case async_tx_rebalance() attempts to spread the
operations evenly over the cpus, e.g. cpu0 gets copy channel0 and xor
channel0 while cpu1 gets copy channel 1 and xor channel 1.  When a
dependency is specified async_tx_find_channel defaults to keeping the
operation on the same channel.  A xor->copy->xor chain will stay on one
channel if it supports both operation types, otherwise the transaction will
transition between a copy and a xor resource.

Currently the raid5 implementation in the MD raid456 driver has been
converted to the async_tx api.  A driver for the offload engines on the
Intel Xscale series of I/O processors, iop-adma, is provided in a later
commit.  With the iop-adma driver and async_tx, raid456 is able to offload
copy, xor, and xor-zero-sum operations to hardware engines.

On iop342 tiobench showed higher throughput for sequential writes (20 - 30%
improvement) and sequential reads to a degraded array (40 - 55%
improvement).  For the other cases performance was roughly equal, +/- a few
percentage points.  On a x86-smp platform the performance of the async_tx
implementation (in synchronous mode) was also +/- a few percentage points
of the original implementation.  According to 'top' on iop342 CPU
utilization drops from ~50% to ~15% during a 'resync' while the speed
according to /proc/mdstat doubles from ~25 MB/s to ~50 MB/s.

The tiobench command line used for testing was: tiobench --size 2048
--block 4096 --block 131072 --dir /mnt/raid --numruns 5
* iop342 had 1GB of memory available

Details:
* if CONFIG_DMA_ENGINE=n the asynchronous path is compiled away by making
  async_tx_find_channel a static inline routine that always returns NULL
* when a callback is specified for a given transaction an interrupt will
  fire at operation completion time and the callback will occur in a
  tasklet.  if the the channel does not support interrupts then a live
  polling wait will be performed
* the api is written as a dmaengine client that requests all available
  channels
* In support of dependencies the api implicitly schedules channel-switch
  interrupts.  The interrupt triggers the cleanup tasklet which causes
  pending operations to be scheduled on the next channel
* Xor engines treat an xor destination address differently than a software
  xor routine.  To the software routine the destination address is an implied
  source, whereas engines treat it as a write-only destination.  This patch
  modifies the xor_blocks routine to take a an explicit destination address
  to mirror the hardware.

Changelog:
* fixed a leftover debug print
* don't allow callbacks in async_interrupt_cond
* fixed xor_block changes
* fixed usage of ASYNC_TX_XOR_DROP_DEST
* drop dma mapping methods, suggested by Chris Leech
* printk warning fixups from Andrew Morton
* don't use inline in C files, Adrian Bunk
* select the API when MD is enabled
* BUG_ON xor source counts <= 1
* implicitly handle hardware concerns like channel switching and
  interrupts, Neil Brown
* remove the per operation type list, and distribute operation capabilities
  evenly amongst the available channels
* simplify async_tx_find_channel to optimize the fast path
* introduce the channel_table_initialized flag to prevent early calls to
  the api
* reorganize the code to mimic crypto
* include mm.h as not all archs include it in dma-mapping.h
* make the Kconfig options non-user visible, Adrian Bunk
* move async_tx under crypto since it is meant as 'core' functionality, and
  the two may share algorithms in the future
* move large inline functions into c files
* checkpatch.pl fixes
* gpl v2 only correction

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 crypto/Kconfig                 |   6 +-
 crypto/Makefile                |   2 +-
 crypto/async_tx/Kconfig        |  16 ++
 crypto/async_tx/Makefile       |   4 +
 crypto/async_tx/async_memcpy.c | 131 +++++++++++
 crypto/async_tx/async_memset.c | 109 +++++++++
 crypto/async_tx/async_tx.c     | 497 +++++++++++++++++++++++++++++++++++++++++
 crypto/async_tx/async_xor.c    | 327 +++++++++++++++++++++++++++
 crypto/xor.c                   |  29 ++-
 drivers/dma/Kconfig            |   5 +-
 drivers/md/Kconfig             |   3 +-
 drivers/md/raid5.c             |  54 ++---
 include/linux/async_tx.h       | 156 +++++++++++++
 include/linux/raid/xor.h       |   5 +-
 14 files changed, 1294 insertions(+), 50 deletions(-)
 create mode 100644 crypto/async_tx/Kconfig
 create mode 100644 crypto/async_tx/Makefile
 create mode 100644 crypto/async_tx/async_memcpy.c
 create mode 100644 crypto/async_tx/async_memset.c
 create mode 100644 crypto/async_tx/async_tx.c
 create mode 100644 crypto/async_tx/async_xor.c
 create mode 100644 include/linux/async_tx.h

(limited to 'include/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index b749a1a46e22..07090e9f9bcf 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -5,9 +5,13 @@ config XOR_BLOCKS
 	tristate
 
 #
-# Cryptographic API Configuration
+# async_tx api: hardware offloaded memory transfer/transform support
 #
+source "crypto/async_tx/Kconfig"
 
+#
+# Cryptographic API Configuration
+#
 menu "Cryptographic options"
 
 config CRYPTO
diff --git a/crypto/Makefile b/crypto/Makefile
index 68e934b4bee2..0cf17f1ea151 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -55,4 +55,4 @@ obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
 # generic algorithms and the async_tx api
 #
 obj-$(CONFIG_XOR_BLOCKS) += xor.o
-
+obj-$(CONFIG_ASYNC_CORE) += async_tx/
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
new file mode 100644
index 000000000000..d8fb39145986
--- /dev/null
+++ b/crypto/async_tx/Kconfig
@@ -0,0 +1,16 @@
+config ASYNC_CORE
+	tristate
+
+config ASYNC_MEMCPY
+	tristate
+	select ASYNC_CORE
+
+config ASYNC_XOR
+	tristate
+	select ASYNC_CORE
+	select XOR_BLOCKS
+
+config ASYNC_MEMSET
+	tristate
+	select ASYNC_CORE
+
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
new file mode 100644
index 000000000000..27baa7d52fbc
--- /dev/null
+++ b/crypto/async_tx/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_ASYNC_CORE) += async_tx.o
+obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
+obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
+obj-$(CONFIG_ASYNC_XOR) += async_xor.o
diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
new file mode 100644
index 000000000000..a973f4ef897d
--- /dev/null
+++ b/crypto/async_tx/async_memcpy.c
@@ -0,0 +1,131 @@
+/*
+ * copy offload engine support
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *      Dan Williams <dan.j.williams@intel.com>
+ *
+ *      with architecture considerations by:
+ *      Neil Brown <neilb@suse.de>
+ *      Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/async_tx.h>
+
+/**
+ * async_memcpy - attempt to copy memory with a dma engine.
+ * @dest: destination page
+ * @src: src page
+ * @offset: offset in pages to start transaction
+ * @len: length in bytes
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK,
+ *	ASYNC_TX_KMAP_SRC, ASYNC_TX_KMAP_DST
+ * @depend_tx: memcpy depends on the result of this transaction
+ * @cb_fn: function to call when the memcpy completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
+	unsigned int src_offset, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY);
+	struct dma_device *device = chan ? chan->device : NULL;
+	int int_en = cb_fn ? 1 : 0;
+	struct dma_async_tx_descriptor *tx = device ?
+		device->device_prep_dma_memcpy(chan, len,
+		int_en) : NULL;
+
+	if (tx) { /* run the memcpy asynchronously */
+		dma_addr_t addr;
+		enum dma_data_direction dir;
+
+		pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len);
+
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_FROM_DEVICE;
+
+		addr = dma_map_page(device->dev, dest, dest_offset, len, dir);
+		tx->tx_set_dest(addr, tx, 0);
+
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_TO_DEVICE;
+
+		addr = dma_map_page(device->dev, src, src_offset, len, dir);
+		tx->tx_set_src(addr, tx, 0);
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else { /* run the memcpy synchronously */
+		void *dest_buf, *src_buf;
+		pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len);
+
+		/* wait for any prerequisite operations */
+		if (depend_tx) {
+			/* if ack is already set then we cannot be sure
+			 * we are referring to the correct operation
+			 */
+			BUG_ON(depend_tx->ack);
+			if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for depend_tx\n",
+					__FUNCTION__);
+		}
+
+		if (flags & ASYNC_TX_KMAP_DST)
+			dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
+		else
+			dest_buf = page_address(dest) + dest_offset;
+
+		if (flags & ASYNC_TX_KMAP_SRC)
+			src_buf = kmap_atomic(src, KM_USER0) + src_offset;
+		else
+			src_buf = page_address(src) + src_offset;
+
+		memcpy(dest_buf, src_buf, len);
+
+		if (flags & ASYNC_TX_KMAP_DST)
+			kunmap_atomic(dest_buf, KM_USER0);
+
+		if (flags & ASYNC_TX_KMAP_SRC)
+			kunmap_atomic(src_buf, KM_USER0);
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_memcpy);
+
+static int __init async_memcpy_init(void)
+{
+	return 0;
+}
+
+static void __exit async_memcpy_exit(void)
+{
+	do { } while (0);
+}
+
+module_init(async_memcpy_init);
+module_exit(async_memcpy_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("asynchronous memcpy api");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
new file mode 100644
index 000000000000..66ef6351202e
--- /dev/null
+++ b/crypto/async_tx/async_memset.c
@@ -0,0 +1,109 @@
+/*
+ * memory fill offload engine support
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *      Dan Williams <dan.j.williams@intel.com>
+ *
+ *      with architecture considerations by:
+ *      Neil Brown <neilb@suse.de>
+ *      Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/async_tx.h>
+
+/**
+ * async_memset - attempt to fill memory with a dma engine.
+ * @dest: destination page
+ * @val: fill value
+ * @offset: offset in pages to start transaction
+ * @len: length in bytes
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: memset depends on the result of this transaction
+ * @cb_fn: function to call when the memcpy completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_memset(struct page *dest, int val, unsigned int offset,
+	size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET);
+	struct dma_device *device = chan ? chan->device : NULL;
+	int int_en = cb_fn ? 1 : 0;
+	struct dma_async_tx_descriptor *tx = device ?
+		device->device_prep_dma_memset(chan, val, len,
+			int_en) : NULL;
+
+	if (tx) { /* run the memset asynchronously */
+		dma_addr_t dma_addr;
+		enum dma_data_direction dir;
+
+		pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len);
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_FROM_DEVICE;
+
+		dma_addr = dma_map_page(device->dev, dest, offset, len, dir);
+		tx->tx_set_dest(dma_addr, tx, 0);
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else { /* run the memset synchronously */
+		void *dest_buf;
+		pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len);
+
+		dest_buf = (void *) (((char *) page_address(dest)) + offset);
+
+		/* wait for any prerequisite operations */
+		if (depend_tx) {
+			/* if ack is already set then we cannot be sure
+			 * we are referring to the correct operation
+			 */
+			BUG_ON(depend_tx->ack);
+			if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for depend_tx\n",
+					__FUNCTION__);
+		}
+
+		memset(dest_buf, val, len);
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_memset);
+
+static int __init async_memset_init(void)
+{
+	return 0;
+}
+
+static void __exit async_memset_exit(void)
+{
+	do { } while (0);
+}
+
+module_init(async_memset_init);
+module_exit(async_memset_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("asynchronous memset api");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c
new file mode 100644
index 000000000000..035007145e78
--- /dev/null
+++ b/crypto/async_tx/async_tx.c
@@ -0,0 +1,497 @@
+/*
+ * core routines for the asynchronous memory transfer/transform api
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *	Dan Williams <dan.j.williams@intel.com>
+ *
+ *	with architecture considerations by:
+ *	Neil Brown <neilb@suse.de>
+ *	Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/async_tx.h>
+
+#ifdef CONFIG_DMA_ENGINE
+static enum dma_state_client
+dma_channel_add_remove(struct dma_client *client,
+	struct dma_chan *chan, enum dma_state state);
+
+static struct dma_client async_tx_dma = {
+	.event_callback = dma_channel_add_remove,
+	/* .cap_mask == 0 defaults to all channels */
+};
+
+/**
+ * dma_cap_mask_all - enable iteration over all operation types
+ */
+static dma_cap_mask_t dma_cap_mask_all;
+
+/**
+ * chan_ref_percpu - tracks channel allocations per core/opertion
+ */
+struct chan_ref_percpu {
+	struct dma_chan_ref *ref;
+};
+
+static int channel_table_initialized;
+static struct chan_ref_percpu *channel_table[DMA_TX_TYPE_END];
+
+/**
+ * async_tx_lock - protect modification of async_tx_master_list and serialize
+ *	rebalance operations
+ */
+static spinlock_t async_tx_lock;
+
+static struct list_head
+async_tx_master_list = LIST_HEAD_INIT(async_tx_master_list);
+
+/* async_tx_issue_pending_all - start all transactions on all channels */
+void async_tx_issue_pending_all(void)
+{
+	struct dma_chan_ref *ref;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+		ref->chan->device->device_issue_pending(ref->chan);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(async_tx_issue_pending_all);
+
+/* dma_wait_for_async_tx - spin wait for a transcation to complete
+ * @tx: transaction to wait on
+ */
+enum dma_status
+dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
+{
+	enum dma_status status;
+	struct dma_async_tx_descriptor *iter;
+
+	if (!tx)
+		return DMA_SUCCESS;
+
+	/* poll through the dependency chain, return when tx is complete */
+	do {
+		iter = tx;
+		while (iter->cookie == -EBUSY)
+			iter = iter->parent;
+
+		status = dma_sync_wait(iter->chan, iter->cookie);
+	} while (status == DMA_IN_PROGRESS || (iter != tx));
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(dma_wait_for_async_tx);
+
+/* async_tx_run_dependencies - helper routine for dma drivers to process
+ *	(start) dependent operations on their target channel
+ * @tx: transaction with dependencies
+ */
+void
+async_tx_run_dependencies(struct dma_async_tx_descriptor *tx)
+{
+	struct dma_async_tx_descriptor *dep_tx, *_dep_tx;
+	struct dma_device *dev;
+	struct dma_chan *chan;
+
+	list_for_each_entry_safe(dep_tx, _dep_tx, &tx->depend_list,
+		depend_node) {
+		chan = dep_tx->chan;
+		dev = chan->device;
+		/* we can't depend on ourselves */
+		BUG_ON(chan == tx->chan);
+		list_del(&dep_tx->depend_node);
+		tx->tx_submit(dep_tx);
+
+		/* we need to poke the engine as client code does not
+		 * know about dependency submission events
+		 */
+		dev->device_issue_pending(chan);
+	}
+}
+EXPORT_SYMBOL_GPL(async_tx_run_dependencies);
+
+static void
+free_dma_chan_ref(struct rcu_head *rcu)
+{
+	struct dma_chan_ref *ref;
+	ref = container_of(rcu, struct dma_chan_ref, rcu);
+	kfree(ref);
+}
+
+static void
+init_dma_chan_ref(struct dma_chan_ref *ref, struct dma_chan *chan)
+{
+	INIT_LIST_HEAD(&ref->node);
+	INIT_RCU_HEAD(&ref->rcu);
+	ref->chan = chan;
+	atomic_set(&ref->count, 0);
+}
+
+/**
+ * get_chan_ref_by_cap - returns the nth channel of the given capability
+ * 	defaults to returning the channel with the desired capability and the
+ * 	lowest reference count if the index can not be satisfied
+ * @cap: capability to match
+ * @index: nth channel desired, passing -1 has the effect of forcing the
+ *  default return value
+ */
+static struct dma_chan_ref *
+get_chan_ref_by_cap(enum dma_transaction_type cap, int index)
+{
+	struct dma_chan_ref *ret_ref = NULL, *min_ref = NULL, *ref;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+		if (dma_has_cap(cap, ref->chan->device->cap_mask)) {
+			if (!min_ref)
+				min_ref = ref;
+			else if (atomic_read(&ref->count) <
+				atomic_read(&min_ref->count))
+				min_ref = ref;
+
+			if (index-- == 0) {
+				ret_ref = ref;
+				break;
+			}
+		}
+	rcu_read_unlock();
+
+	if (!ret_ref)
+		ret_ref = min_ref;
+
+	if (ret_ref)
+		atomic_inc(&ret_ref->count);
+
+	return ret_ref;
+}
+
+/**
+ * async_tx_rebalance - redistribute the available channels, optimize
+ * for cpu isolation in the SMP case, and opertaion isolation in the
+ * uniprocessor case
+ */
+static void async_tx_rebalance(void)
+{
+	int cpu, cap, cpu_idx = 0;
+	unsigned long flags;
+
+	if (!channel_table_initialized)
+		return;
+
+	spin_lock_irqsave(&async_tx_lock, flags);
+
+	/* undo the last distribution */
+	for_each_dma_cap_mask(cap, dma_cap_mask_all)
+		for_each_possible_cpu(cpu) {
+			struct dma_chan_ref *ref =
+				per_cpu_ptr(channel_table[cap], cpu)->ref;
+			if (ref) {
+				atomic_set(&ref->count, 0);
+				per_cpu_ptr(channel_table[cap], cpu)->ref =
+									NULL;
+			}
+		}
+
+	for_each_dma_cap_mask(cap, dma_cap_mask_all)
+		for_each_online_cpu(cpu) {
+			struct dma_chan_ref *new;
+			if (NR_CPUS > 1)
+				new = get_chan_ref_by_cap(cap, cpu_idx++);
+			else
+				new = get_chan_ref_by_cap(cap, -1);
+
+			per_cpu_ptr(channel_table[cap], cpu)->ref = new;
+		}
+
+	spin_unlock_irqrestore(&async_tx_lock, flags);
+}
+
+static enum dma_state_client
+dma_channel_add_remove(struct dma_client *client,
+	struct dma_chan *chan, enum dma_state state)
+{
+	unsigned long found, flags;
+	struct dma_chan_ref *master_ref, *ref;
+	enum dma_state_client ack = DMA_DUP; /* default: take no action */
+
+	switch (state) {
+	case DMA_RESOURCE_AVAILABLE:
+		found = 0;
+		rcu_read_lock();
+		list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+			if (ref->chan == chan) {
+				found = 1;
+				break;
+			}
+		rcu_read_unlock();
+
+		pr_debug("async_tx: dma resource available [%s]\n",
+			found ? "old" : "new");
+
+		if (!found)
+			ack = DMA_ACK;
+		else
+			break;
+
+		/* add the channel to the generic management list */
+		master_ref = kmalloc(sizeof(*master_ref), GFP_KERNEL);
+		if (master_ref) {
+			/* keep a reference until async_tx is unloaded */
+			dma_chan_get(chan);
+			init_dma_chan_ref(master_ref, chan);
+			spin_lock_irqsave(&async_tx_lock, flags);
+			list_add_tail_rcu(&master_ref->node,
+				&async_tx_master_list);
+			spin_unlock_irqrestore(&async_tx_lock,
+				flags);
+		} else {
+			printk(KERN_WARNING "async_tx: unable to create"
+				" new master entry in response to"
+				" a DMA_RESOURCE_ADDED event"
+				" (-ENOMEM)\n");
+			return 0;
+		}
+
+		async_tx_rebalance();
+		break;
+	case DMA_RESOURCE_REMOVED:
+		found = 0;
+		spin_lock_irqsave(&async_tx_lock, flags);
+		list_for_each_entry_rcu(ref, &async_tx_master_list, node)
+			if (ref->chan == chan) {
+				/* permit backing devices to go away */
+				dma_chan_put(ref->chan);
+				list_del_rcu(&ref->node);
+				call_rcu(&ref->rcu, free_dma_chan_ref);
+				found = 1;
+				break;
+			}
+		spin_unlock_irqrestore(&async_tx_lock, flags);
+
+		pr_debug("async_tx: dma resource removed [%s]\n",
+			found ? "ours" : "not ours");
+
+		if (found)
+			ack = DMA_ACK;
+		else
+			break;
+
+		async_tx_rebalance();
+		break;
+	case DMA_RESOURCE_SUSPEND:
+	case DMA_RESOURCE_RESUME:
+		printk(KERN_WARNING "async_tx: does not support dma channel"
+			" suspend/resume\n");
+		break;
+	default:
+		BUG();
+	}
+
+	return ack;
+}
+
+static int __init
+async_tx_init(void)
+{
+	enum dma_transaction_type cap;
+
+	spin_lock_init(&async_tx_lock);
+	bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END);
+
+	/* an interrupt will never be an explicit operation type.
+	 * clearing this bit prevents allocation to a slot in 'channel_table'
+	 */
+	clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits);
+
+	for_each_dma_cap_mask(cap, dma_cap_mask_all) {
+		channel_table[cap] = alloc_percpu(struct chan_ref_percpu);
+		if (!channel_table[cap])
+			goto err;
+	}
+
+	channel_table_initialized = 1;
+	dma_async_client_register(&async_tx_dma);
+	dma_async_client_chan_request(&async_tx_dma);
+
+	printk(KERN_INFO "async_tx: api initialized (async)\n");
+
+	return 0;
+err:
+	printk(KERN_ERR "async_tx: initialization failure\n");
+
+	while (--cap >= 0)
+		free_percpu(channel_table[cap]);
+
+	return 1;
+}
+
+static void __exit async_tx_exit(void)
+{
+	enum dma_transaction_type cap;
+
+	channel_table_initialized = 0;
+
+	for_each_dma_cap_mask(cap, dma_cap_mask_all)
+		if (channel_table[cap])
+			free_percpu(channel_table[cap]);
+
+	dma_async_client_unregister(&async_tx_dma);
+}
+
+/**
+ * async_tx_find_channel - find a channel to carry out the operation or let
+ *	the transaction execute synchronously
+ * @depend_tx: transaction dependency
+ * @tx_type: transaction type
+ */
+struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type)
+{
+	/* see if we can keep the chain on one channel */
+	if (depend_tx &&
+		dma_has_cap(tx_type, depend_tx->chan->device->cap_mask))
+		return depend_tx->chan;
+	else if (likely(channel_table_initialized)) {
+		struct dma_chan_ref *ref;
+		int cpu = get_cpu();
+		ref = per_cpu_ptr(channel_table[tx_type], cpu)->ref;
+		put_cpu();
+		return ref ? ref->chan : NULL;
+	} else
+		return NULL;
+}
+EXPORT_SYMBOL_GPL(async_tx_find_channel);
+#else
+static int __init async_tx_init(void)
+{
+	printk(KERN_INFO "async_tx: api initialized (sync-only)\n");
+	return 0;
+}
+
+static void __exit async_tx_exit(void)
+{
+	do { } while (0);
+}
+#endif
+
+void
+async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	tx->callback = cb_fn;
+	tx->callback_param = cb_param;
+
+	/* set this new tx to run after depend_tx if:
+	 * 1/ a dependency exists (depend_tx is !NULL)
+	 * 2/ the tx can not be submitted to the current channel
+	 */
+	if (depend_tx && depend_tx->chan != chan) {
+		/* if ack is already set then we cannot be sure
+		 * we are referring to the correct operation
+		 */
+		BUG_ON(depend_tx->ack);
+
+		tx->parent = depend_tx;
+		spin_lock_bh(&depend_tx->lock);
+		list_add_tail(&tx->depend_node, &depend_tx->depend_list);
+		if (depend_tx->cookie == 0) {
+			struct dma_chan *dep_chan = depend_tx->chan;
+			struct dma_device *dep_dev = dep_chan->device;
+			dep_dev->device_dependency_added(dep_chan);
+		}
+		spin_unlock_bh(&depend_tx->lock);
+
+		/* schedule an interrupt to trigger the channel switch */
+		async_trigger_callback(ASYNC_TX_ACK, depend_tx, NULL, NULL);
+	} else {
+		tx->parent = NULL;
+		tx->tx_submit(tx);
+	}
+
+	if (flags & ASYNC_TX_ACK)
+		async_tx_ack(tx);
+
+	if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+		async_tx_ack(depend_tx);
+}
+EXPORT_SYMBOL_GPL(async_tx_submit);
+
+/**
+ * async_trigger_callback - schedules the callback function to be run after
+ * any dependent operations have been completed.
+ * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: 'callback' requires the completion of this transaction
+ * @cb_fn: function to call after depend_tx completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_trigger_callback(enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan;
+	struct dma_device *device;
+	struct dma_async_tx_descriptor *tx;
+
+	if (depend_tx) {
+		chan = depend_tx->chan;
+		device = chan->device;
+
+		/* see if we can schedule an interrupt
+		 * otherwise poll for completion
+		 */
+		if (device && !dma_has_cap(DMA_INTERRUPT, device->cap_mask))
+			device = NULL;
+
+		tx = device ? device->device_prep_dma_interrupt(chan) : NULL;
+	} else
+		tx = NULL;
+
+	if (tx) {
+		pr_debug("%s: (async)\n", __FUNCTION__);
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else {
+		pr_debug("%s: (sync)\n", __FUNCTION__);
+
+		/* wait for any prerequisite operations */
+		if (depend_tx) {
+			/* if ack is already set then we cannot be sure
+			 * we are referring to the correct operation
+			 */
+			BUG_ON(depend_tx->ack);
+			if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for depend_tx\n",
+					__FUNCTION__);
+		}
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_trigger_callback);
+
+module_init(async_tx_init);
+module_exit(async_tx_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API");
+MODULE_LICENSE("GPL");
diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
new file mode 100644
index 000000000000..2575f674dcd5
--- /dev/null
+++ b/crypto/async_tx/async_xor.c
@@ -0,0 +1,327 @@
+/*
+ * xor offload engine api
+ *
+ * Copyright © 2006, Intel Corporation.
+ *
+ *      Dan Williams <dan.j.williams@intel.com>
+ *
+ *      with architecture considerations by:
+ *      Neil Brown <neilb@suse.de>
+ *      Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/raid/xor.h>
+#include <linux/async_tx.h>
+
+static void
+do_async_xor(struct dma_async_tx_descriptor *tx, struct dma_device *device,
+	struct dma_chan *chan, struct page *dest, struct page **src_list,
+	unsigned int offset, unsigned int src_cnt, size_t len,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	dma_addr_t dma_addr;
+	enum dma_data_direction dir;
+	int i;
+
+	pr_debug("%s: len: %zu\n", __FUNCTION__, len);
+
+	dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+		DMA_NONE : DMA_FROM_DEVICE;
+
+	dma_addr = dma_map_page(device->dev, dest, offset, len, dir);
+	tx->tx_set_dest(dma_addr, tx, 0);
+
+	dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+		DMA_NONE : DMA_TO_DEVICE;
+
+	for (i = 0; i < src_cnt; i++) {
+		dma_addr = dma_map_page(device->dev, src_list[i],
+			offset, len, dir);
+		tx->tx_set_src(dma_addr, tx, i);
+	}
+
+	async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+}
+
+static void
+do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	unsigned int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	void *_dest;
+	int i;
+
+	pr_debug("%s: len: %zu\n", __FUNCTION__, len);
+
+	/* reuse the 'src_list' array to convert to buffer pointers */
+	for (i = 0; i < src_cnt; i++)
+		src_list[i] = (struct page *)
+			(page_address(src_list[i]) + offset);
+
+	/* set destination address */
+	_dest = page_address(dest) + offset;
+
+	if (flags & ASYNC_TX_XOR_ZERO_DST)
+		memset(_dest, 0, len);
+
+	xor_blocks(src_cnt, len, _dest,
+		(void **) src_list);
+
+	async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+}
+
+/**
+ * async_xor - attempt to xor a set of blocks with a dma engine.
+ *	xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST
+ *	flag must be set to not include dest data in the calculation.  The
+ *	assumption with dma eninges is that they only use the destination
+ *	buffer as a source when it is explicity specified in the source list.
+ * @dest: destination page
+ * @src_list: array of source pages (if the dest is also a source it must be
+ *	at index zero).  The contents of this array may be overwritten.
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST,
+ *	ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: xor depends on the result of this transaction.
+ * @cb_fn: function to call when the xor completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR);
+	struct dma_device *device = chan ? chan->device : NULL;
+	struct dma_async_tx_descriptor *tx = NULL;
+	dma_async_tx_callback _cb_fn;
+	void *_cb_param;
+	unsigned long local_flags;
+	int xor_src_cnt;
+	int i = 0, src_off = 0, int_en;
+
+	BUG_ON(src_cnt <= 1);
+
+	while (src_cnt) {
+		local_flags = flags;
+		if (device) { /* run the xor asynchronously */
+			xor_src_cnt = min(src_cnt, device->max_xor);
+			/* if we are submitting additional xors
+			 * only set the callback on the last transaction
+			 */
+			if (src_cnt > xor_src_cnt) {
+				local_flags &= ~ASYNC_TX_ACK;
+				_cb_fn = NULL;
+				_cb_param = NULL;
+			} else {
+				_cb_fn = cb_fn;
+				_cb_param = cb_param;
+			}
+
+			int_en = _cb_fn ? 1 : 0;
+
+			tx = device->device_prep_dma_xor(
+				chan, xor_src_cnt, len, int_en);
+
+			if (tx) {
+				do_async_xor(tx, device, chan, dest,
+				&src_list[src_off], offset, xor_src_cnt, len,
+				local_flags, depend_tx, _cb_fn,
+				_cb_param);
+			} else /* fall through */
+				goto xor_sync;
+		} else { /* run the xor synchronously */
+xor_sync:
+			/* in the sync case the dest is an implied source
+			 * (assumes the dest is at the src_off index)
+			 */
+			if (flags & ASYNC_TX_XOR_DROP_DST) {
+				src_cnt--;
+				src_off++;
+			}
+
+			/* process up to 'MAX_XOR_BLOCKS' sources */
+			xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+
+			/* if we are submitting additional xors
+			 * only set the callback on the last transaction
+			 */
+			if (src_cnt > xor_src_cnt) {
+				local_flags &= ~ASYNC_TX_ACK;
+				_cb_fn = NULL;
+				_cb_param = NULL;
+			} else {
+				_cb_fn = cb_fn;
+				_cb_param = cb_param;
+			}
+
+			/* wait for any prerequisite operations */
+			if (depend_tx) {
+				/* if ack is already set then we cannot be sure
+				 * we are referring to the correct operation
+				 */
+				BUG_ON(depend_tx->ack);
+				if (dma_wait_for_async_tx(depend_tx) ==
+					DMA_ERROR)
+					panic("%s: DMA_ERROR waiting for "
+						"depend_tx\n",
+						__FUNCTION__);
+			}
+
+			do_sync_xor(dest, &src_list[src_off], offset,
+				xor_src_cnt, len, local_flags, depend_tx,
+				_cb_fn, _cb_param);
+		}
+
+		/* the previous tx is hidden from the client,
+		 * so ack it
+		 */
+		if (i && depend_tx)
+			async_tx_ack(depend_tx);
+
+		depend_tx = tx;
+
+		if (src_cnt > xor_src_cnt) {
+			/* drop completed sources */
+			src_cnt -= xor_src_cnt;
+			src_off += xor_src_cnt;
+
+			/* unconditionally preserve the destination */
+			flags &= ~ASYNC_TX_XOR_ZERO_DST;
+
+			/* use the intermediate result a source, but remember
+			 * it's dropped, because it's implied, in the sync case
+			 */
+			src_list[--src_off] = dest;
+			src_cnt++;
+			flags |= ASYNC_TX_XOR_DROP_DST;
+		} else
+			src_cnt = 0;
+		i++;
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_xor);
+
+static int page_is_zero(struct page *p, unsigned int offset, size_t len)
+{
+	char *a = page_address(p) + offset;
+	return ((*(u32 *) a) == 0 &&
+		memcmp(a, a + 4, len - 4) == 0);
+}
+
+/**
+ * async_xor_zero_sum - attempt a xor parity check with a dma engine.
+ * @dest: destination page used if the xor is performed synchronously
+ * @src_list: array of source pages.  The dest page must be listed as a source
+ * 	at index zero.  The contents of this array may be overwritten.
+ * @offset: offset in pages to start transaction
+ * @src_cnt: number of source pages
+ * @len: length in bytes
+ * @result: 0 if sum == 0 else non-zero
+ * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK
+ * @depend_tx: xor depends on the result of this transaction.
+ * @cb_fn: function to call when the xor completes
+ * @cb_param: parameter to pass to the callback routine
+ */
+struct dma_async_tx_descriptor *
+async_xor_zero_sum(struct page *dest, struct page **src_list,
+	unsigned int offset, int src_cnt, size_t len,
+	u32 *result, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_param)
+{
+	struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_ZERO_SUM);
+	struct dma_device *device = chan ? chan->device : NULL;
+	int int_en = cb_fn ? 1 : 0;
+	struct dma_async_tx_descriptor *tx = device ?
+		device->device_prep_dma_zero_sum(chan, src_cnt, len, result,
+			int_en) : NULL;
+	int i;
+
+	BUG_ON(src_cnt <= 1);
+
+	if (tx) {
+		dma_addr_t dma_addr;
+		enum dma_data_direction dir;
+
+		pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len);
+
+		dir = (flags & ASYNC_TX_ASSUME_COHERENT) ?
+			DMA_NONE : DMA_TO_DEVICE;
+
+		for (i = 0; i < src_cnt; i++) {
+			dma_addr = dma_map_page(device->dev, src_list[i],
+				offset, len, dir);
+			tx->tx_set_src(dma_addr, tx, i);
+		}
+
+		async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param);
+	} else {
+		unsigned long xor_flags = flags;
+
+		pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len);
+
+		xor_flags |= ASYNC_TX_XOR_DROP_DST;
+		xor_flags &= ~ASYNC_TX_ACK;
+
+		tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags,
+			depend_tx, NULL, NULL);
+
+		if (tx) {
+			if (dma_wait_for_async_tx(tx) == DMA_ERROR)
+				panic("%s: DMA_ERROR waiting for tx\n",
+					__FUNCTION__);
+			async_tx_ack(tx);
+		}
+
+		*result = page_is_zero(dest, offset, len) ? 0 : 1;
+
+		tx = NULL;
+
+		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
+	}
+
+	return tx;
+}
+EXPORT_SYMBOL_GPL(async_xor_zero_sum);
+
+static int __init async_xor_init(void)
+{
+	return 0;
+}
+
+static void __exit async_xor_exit(void)
+{
+	do { } while (0);
+}
+
+module_init(async_xor_init);
+module_exit(async_xor_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api");
+MODULE_LICENSE("GPL");
diff --git a/crypto/xor.c b/crypto/xor.c
index 8281ac5e68a8..b2e6db075e49 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -26,31 +26,30 @@
 static struct xor_block_template *active_template;
 
 void
-xor_blocks(unsigned int count, unsigned int bytes, void **ptr)
+xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
 {
-	unsigned long *p0, *p1, *p2, *p3, *p4;
+	unsigned long *p1, *p2, *p3, *p4;
 
-	p0 = (unsigned long *) ptr[0];
-	p1 = (unsigned long *) ptr[1];
-	if (count == 2) {
-		active_template->do_2(bytes, p0, p1);
+	p1 = (unsigned long *) srcs[0];
+	if (src_count == 1) {
+		active_template->do_2(bytes, dest, p1);
 		return;
 	}
 
-	p2 = (unsigned long *) ptr[2];
-	if (count == 3) {
-		active_template->do_3(bytes, p0, p1, p2);
+	p2 = (unsigned long *) srcs[1];
+	if (src_count == 2) {
+		active_template->do_3(bytes, dest, p1, p2);
 		return;
 	}
 
-	p3 = (unsigned long *) ptr[3];
-	if (count == 4) {
-		active_template->do_4(bytes, p0, p1, p2, p3);
+	p3 = (unsigned long *) srcs[2];
+	if (src_count == 3) {
+		active_template->do_4(bytes, dest, p1, p2, p3);
 		return;
 	}
 
-	p4 = (unsigned long *) ptr[4];
-	active_template->do_5(bytes, p0, p1, p2, p3, p4);
+	p4 = (unsigned long *) srcs[3];
+	active_template->do_5(bytes, dest, p1, p2, p3, p4);
 }
 EXPORT_SYMBOL(xor_blocks);
 
@@ -128,7 +127,7 @@ calibrate_xor_blocks(void)
 			fastest->name);
 		xor_speed(fastest);
 	} else {
-		printk(KERN_INFO "xor: measuring checksumming speed\n");
+		printk(KERN_INFO "xor: measuring software checksum speed\n");
 		XOR_TRY_TEMPLATES;
 		fastest = template_list;
 		for (f = fastest; f; f = f->next)
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 72be6c63edfc..492aa080562d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -8,8 +8,8 @@ menu "DMA Engine support"
 config DMA_ENGINE
 	bool "Support for DMA engines"
 	---help---
-	  DMA engines offload copy operations from the CPU to dedicated
-	  hardware, allowing the copies to happen asynchronously.
+          DMA engines offload bulk memory operations from the CPU to dedicated
+          hardware, allowing the operations to happen asynchronously.
 
 comment "DMA Clients"
 
@@ -31,5 +31,4 @@ config INTEL_IOATDMA
 	default m
 	---help---
 	  Enable support for the Intel(R) I/OAT DMA engine.
-
 endmenu
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 24d93d02a1f3..bfd9b9c6252c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -109,7 +109,8 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
-	select XOR_BLOCKS
+	select ASYNC_MEMCPY
+	select ASYNC_XOR
 	---help---
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5adbe0b22684..4f51dfa8e487 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -916,25 +916,25 @@ static void copy_data(int frombio, struct bio *bio,
 	}
 }
 
-#define check_xor() 	do { 						\
-			   if (count == MAX_XOR_BLOCKS) {		\
-				xor_blocks(count, STRIPE_SIZE, ptr);	\
-				count = 1;				\
-			   }						\
+#define check_xor()	do {						  \
+				if (count == MAX_XOR_BLOCKS) {		  \
+				xor_blocks(count, STRIPE_SIZE, dest, ptr);\
+				count = 0;				  \
+			   }						  \
 			} while(0)
 
 
 static void compute_block(struct stripe_head *sh, int dd_idx)
 {
 	int i, count, disks = sh->disks;
-	void *ptr[MAX_XOR_BLOCKS], *p;
+	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
 
 	PRINTK("compute_block, stripe %llu, idx %d\n", 
 		(unsigned long long)sh->sector, dd_idx);
 
-	ptr[0] = page_address(sh->dev[dd_idx].page);
-	memset(ptr[0], 0, STRIPE_SIZE);
-	count = 1;
+	dest = page_address(sh->dev[dd_idx].page);
+	memset(dest, 0, STRIPE_SIZE);
+	count = 0;
 	for (i = disks ; i--; ) {
 		if (i == dd_idx)
 			continue;
@@ -948,8 +948,8 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
 
 		check_xor();
 	}
-	if (count != 1)
-		xor_blocks(count, STRIPE_SIZE, ptr);
+	if (count)
+		xor_blocks(count, STRIPE_SIZE, dest, ptr);
 	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 }
 
@@ -957,14 +957,14 @@ static void compute_parity5(struct stripe_head *sh, int method)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-	void *ptr[MAX_XOR_BLOCKS];
+	void *ptr[MAX_XOR_BLOCKS], *dest;
 	struct bio *chosen;
 
 	PRINTK("compute_parity5, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
 
-	count = 1;
-	ptr[0] = page_address(sh->dev[pd_idx].page);
+	count = 0;
+	dest = page_address(sh->dev[pd_idx].page);
 	switch(method) {
 	case READ_MODIFY_WRITE:
 		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
@@ -987,7 +987,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
 		}
 		break;
 	case RECONSTRUCT_WRITE:
-		memset(ptr[0], 0, STRIPE_SIZE);
+		memset(dest, 0, STRIPE_SIZE);
 		for (i= disks; i-- ;)
 			if (i!=pd_idx && sh->dev[i].towrite) {
 				chosen = sh->dev[i].towrite;
@@ -1003,9 +1003,9 @@ static void compute_parity5(struct stripe_head *sh, int method)
 	case CHECK_PARITY:
 		break;
 	}
-	if (count>1) {
-		xor_blocks(count, STRIPE_SIZE, ptr);
-		count = 1;
+	if (count) {
+		xor_blocks(count, STRIPE_SIZE, dest, ptr);
+		count = 0;
 	}
 	
 	for (i = disks; i--;)
@@ -1037,9 +1037,9 @@ static void compute_parity5(struct stripe_head *sh, int method)
 				check_xor();
 			}
 	}
-	if (count != 1)
-		xor_blocks(count, STRIPE_SIZE, ptr);
-	
+	if (count)
+		xor_blocks(count, STRIPE_SIZE, dest, ptr);
+
 	if (method != CHECK_PARITY) {
 		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
 		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
@@ -1132,7 +1132,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
 static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	int i, count, disks = sh->disks;
-	void *ptr[MAX_XOR_BLOCKS], *p;
+	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
 	int pd_idx = sh->pd_idx;
 	int qd_idx = raid6_next_disk(pd_idx, disks);
 
@@ -1143,9 +1143,9 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 		/* We're actually computing the Q drive */
 		compute_parity6(sh, UPDATE_PARITY);
 	} else {
-		ptr[0] = page_address(sh->dev[dd_idx].page);
-		if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
-		count = 1;
+		dest = page_address(sh->dev[dd_idx].page);
+		if (!nozero) memset(dest, 0, STRIPE_SIZE);
+		count = 0;
 		for (i = disks ; i--; ) {
 			if (i == dd_idx || i == qd_idx)
 				continue;
@@ -1159,8 +1159,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 
 			check_xor();
 		}
-		if (count != 1)
-			xor_blocks(count, STRIPE_SIZE, ptr);
+		if (count)
+			xor_blocks(count, STRIPE_SIZE, dest, ptr);
 		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 	}
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
new file mode 100644
index 000000000000..ff1255079fa1
--- /dev/null
+++ b/include/linux/async_tx.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef _ASYNC_TX_H_
+#define _ASYNC_TX_H_
+#include <linux/dmaengine.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+/**
+ * dma_chan_ref - object used to manage dma channels received from the
+ *   dmaengine core.
+ * @chan - the channel being tracked
+ * @node - node for the channel to be placed on async_tx_master_list
+ * @rcu - for list_del_rcu
+ * @count - number of times this channel is listed in the pool
+ *	(for channels with multiple capabiities)
+ */
+struct dma_chan_ref {
+	struct dma_chan *chan;
+	struct list_head node;
+	struct rcu_head rcu;
+	atomic_t count;
+};
+
+/**
+ * async_tx_flags - modifiers for the async_* calls
+ * @ASYNC_TX_XOR_ZERO_DST: this flag must be used for xor operations where the
+ * the destination address is not a source.  The asynchronous case handles this
+ * implicitly, the synchronous case needs to zero the destination block.
+ * @ASYNC_TX_XOR_DROP_DST: this flag must be used if the destination address is
+ * also one of the source addresses.  In the synchronous case the destination
+ * address is an implied source, whereas the asynchronous case it must be listed
+ * as a source.  The destination address must be the first address in the source
+ * array.
+ * @ASYNC_TX_ASSUME_COHERENT: skip cache maintenance operations
+ * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
+ * dependency chain
+ * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
+ * @ASYNC_TX_KMAP_SRC: if the transaction is to be performed synchronously
+ * take an atomic mapping (KM_USER0) on the source page(s)
+ * @ASYNC_TX_KMAP_DST: if the transaction is to be performed synchronously
+ * take an atomic mapping (KM_USER0) on the dest page(s)
+ */
+enum async_tx_flags {
+	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
+	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
+	ASYNC_TX_ASSUME_COHERENT = (1 << 2),
+	ASYNC_TX_ACK		 = (1 << 3),
+	ASYNC_TX_DEP_ACK	 = (1 << 4),
+	ASYNC_TX_KMAP_SRC	 = (1 << 5),
+	ASYNC_TX_KMAP_DST	 = (1 << 6),
+};
+
+#ifdef CONFIG_DMA_ENGINE
+void async_tx_issue_pending_all(void);
+enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
+void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx);
+struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type);
+#else
+static inline void async_tx_issue_pending_all(void)
+{
+	do { } while (0);
+}
+
+static inline enum dma_status
+dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
+{
+	return DMA_SUCCESS;
+}
+
+static inline void
+async_tx_run_dependencies(struct dma_async_tx_descriptor *tx,
+	struct dma_chan *host_chan)
+{
+	do { } while (0);
+}
+
+static inline struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * async_tx_sync_epilog - actions to take if an operation is run synchronously
+ * @flags: async_tx flags
+ * @depend_tx: transaction depends on depend_tx
+ * @cb_fn: function to call when the transaction completes
+ * @cb_fn_param: parameter to pass to the callback routine
+ */
+static inline void
+async_tx_sync_epilog(unsigned long flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param)
+{
+	if (cb_fn)
+		cb_fn(cb_fn_param);
+
+	if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+		async_tx_ack(depend_tx);
+}
+
+void
+async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_xor_zero_sum(struct page *dest, struct page **src_list,
+	unsigned int offset, int src_cnt, size_t len,
+	u32 *result, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
+	unsigned int src_offset, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_memset(struct page *dest, int val, unsigned int offset,
+	size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_trigger_callback(enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+#endif /* _ASYNC_TX_H_ */
diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
index 7d6c20b654fa..3e120587eada 100644
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -3,9 +3,10 @@
 
 #include <linux/raid/md.h>
 
-#define MAX_XOR_BLOCKS 5
+#define MAX_XOR_BLOCKS 4
 
-extern void xor_blocks(unsigned int count, unsigned int bytes, void **ptr);
+extern void xor_blocks(unsigned int count, unsigned int bytes,
+	void *dest, void **srcs);
 
 struct xor_block_template {
         struct xor_block_template *next;
-- 
cgit v1.3-8-gc7d7


From a445685647e825c713175d180ffc8dd54d90589b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 9 Jul 2007 11:56:43 -0700
Subject: raid5: refactor handle_stripe5 and handle_stripe6 (v3)

handle_stripe5 and handle_stripe6 have very deep logic paths handling the
various states of a stripe_head.  By introducing the 'stripe_head_state'
and 'r6_state' objects, large portions of the logic can be moved to
sub-routines.

'struct stripe_head_state' consumes all of the automatic variables that previously
stood alone in handle_stripe5,6.  'struct r6_state' contains the handle_stripe6
specific variables like p_failed and q_failed.

One of the nice side effects of the 'stripe_head_state' change is that it
allows for further reductions in code duplication between raid5 and raid6.
The following new routines are shared between raid5 and raid6:

	handle_completed_write_requests
	handle_requests_to_failed_array
	handle_stripe_expansion

Changes:
* v2: fixed 'conf->raid_disk-1' for the raid6 'handle_stripe_expansion' path
* v3: removed the unused 'dirty' field from struct stripe_head_state
* v3: coalesced open coded bi_end_io routines into return_io()

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 1526 +++++++++++++++++++++-----------------------
 include/linux/raid/raid5.h |   16 +
 2 files changed, 756 insertions(+), 786 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4f51dfa8e487..38232fa111a4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -104,6 +104,23 @@ static inline int raid6_next_disk(int disk, int raid_disks)
 	disk++;
 	return (disk < raid_disks) ? disk : 0;
 }
+
+static void return_io(struct bio *return_bi)
+{
+	struct bio *bi = return_bi;
+	while (bi) {
+		int bytes = bi->bi_size;
+
+		return_bi = bi->bi_next;
+		bi->bi_next = NULL;
+		bi->bi_size = 0;
+		bi->bi_end_io(bi, bytes,
+			      test_bit(BIO_UPTODATE, &bi->bi_flags)
+			        ? 0 : -EIO);
+		bi = return_bi;
+	}
+}
+
 static void print_raid5_conf (raid5_conf_t *conf);
 
 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
@@ -1326,6 +1343,608 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 	return pd_idx;
 }
 
+static void
+handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
+				struct stripe_head_state *s, int disks,
+				struct bio **return_bi)
+{
+	int i;
+	for (i = disks; i--; ) {
+		struct bio *bi;
+		int bitmap_end = 0;
+
+		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+			mdk_rdev_t *rdev;
+			rcu_read_lock();
+			rdev = rcu_dereference(conf->disks[i].rdev);
+			if (rdev && test_bit(In_sync, &rdev->flags))
+				/* multiple read failures in one stripe */
+				md_error(conf->mddev, rdev);
+			rcu_read_unlock();
+		}
+		spin_lock_irq(&conf->device_lock);
+		/* fail all writes first */
+		bi = sh->dev[i].towrite;
+		sh->dev[i].towrite = NULL;
+		if (bi) {
+			s->to_write--;
+			bitmap_end = 1;
+		}
+
+		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+			wake_up(&conf->wait_for_overlap);
+
+		while (bi && bi->bi_sector <
+			sh->dev[i].sector + STRIPE_SECTORS) {
+			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			if (--bi->bi_phys_segments == 0) {
+				md_write_end(conf->mddev);
+				bi->bi_next = *return_bi;
+				*return_bi = bi;
+			}
+			bi = nextbi;
+		}
+		/* and fail all 'written' */
+		bi = sh->dev[i].written;
+		sh->dev[i].written = NULL;
+		if (bi) bitmap_end = 1;
+		while (bi && bi->bi_sector <
+		       sh->dev[i].sector + STRIPE_SECTORS) {
+			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
+			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			if (--bi->bi_phys_segments == 0) {
+				md_write_end(conf->mddev);
+				bi->bi_next = *return_bi;
+				*return_bi = bi;
+			}
+			bi = bi2;
+		}
+
+		/* fail any reads if this device is non-operational */
+		if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+		    test_bit(R5_ReadError, &sh->dev[i].flags)) {
+			bi = sh->dev[i].toread;
+			sh->dev[i].toread = NULL;
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
+			if (bi) s->to_read--;
+			while (bi && bi->bi_sector <
+			       sh->dev[i].sector + STRIPE_SECTORS) {
+				struct bio *nextbi =
+					r5_next_bio(bi, sh->dev[i].sector);
+				clear_bit(BIO_UPTODATE, &bi->bi_flags);
+				if (--bi->bi_phys_segments == 0) {
+					bi->bi_next = *return_bi;
+					*return_bi = bi;
+				}
+				bi = nextbi;
+			}
+		}
+		spin_unlock_irq(&conf->device_lock);
+		if (bitmap_end)
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+					STRIPE_SECTORS, 0, 0);
+	}
+
+}
+
+static void handle_issuing_new_read_requests5(struct stripe_head *sh,
+			struct stripe_head_state *s, int disks)
+{
+	int i;
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (!test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags) &&
+		    (dev->toread ||
+		     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+		     s->syncing || s->expanding ||
+		     (s->failed && (sh->dev[s->failed_num].toread ||
+			(sh->dev[s->failed_num].towrite &&
+			!test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags))
+		      )))) {
+			/* we would like to get this block, possibly
+			 * by computing it, but we might not be able to
+			 */
+			if (s->uptodate == disks-1) {
+				PRINTK("Computing block %d\n", i);
+				compute_block(sh, i);
+				s->uptodate++;
+			} else if (test_bit(R5_Insync, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				s->locked++;
+				PRINTK("Reading block %d (sync=%d)\n",
+					i, s->syncing);
+			}
+		}
+	}
+	set_bit(STRIPE_HANDLE, &sh->state);
+}
+
+static void handle_issuing_new_read_requests6(struct stripe_head *sh,
+			struct stripe_head_state *s, struct r6_state *r6s,
+			int disks)
+{
+	int i;
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (!test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags) &&
+		    (dev->toread || (dev->towrite &&
+		     !test_bit(R5_OVERWRITE, &dev->flags)) ||
+		     s->syncing || s->expanding ||
+		     (s->failed >= 1 &&
+		      (sh->dev[r6s->failed_num[0]].toread ||
+		       s->to_write)) ||
+		     (s->failed >= 2 &&
+		      (sh->dev[r6s->failed_num[1]].toread ||
+		       s->to_write)))) {
+			/* we would like to get this block, possibly
+			 * by computing it, but we might not be able to
+			 */
+			if (s->uptodate == disks-1) {
+				PRINTK("Computing stripe %llu block %d\n",
+				       (unsigned long long)sh->sector, i);
+				compute_block_1(sh, i, 0);
+				s->uptodate++;
+			} else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
+				/* Computing 2-failure is *very* expensive; only
+				 * do it if failed >= 2
+				 */
+				int other;
+				for (other = disks; other--; ) {
+					if (other == i)
+						continue;
+					if (!test_bit(R5_UPTODATE,
+					      &sh->dev[other].flags))
+						break;
+				}
+				BUG_ON(other < 0);
+				PRINTK("Computing stripe %llu blocks %d,%d\n",
+				       (unsigned long long)sh->sector,
+				       i, other);
+				compute_block_2(sh, i, other);
+				s->uptodate += 2;
+			} else if (test_bit(R5_Insync, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantread, &dev->flags);
+				s->locked++;
+				PRINTK("Reading block %d (sync=%d)\n",
+					i, s->syncing);
+			}
+		}
+	}
+	set_bit(STRIPE_HANDLE, &sh->state);
+}
+
+
+/* handle_completed_write_requests
+ * any written block on an uptodate or failed drive can be returned.
+ * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
+ * never LOCKED, so we don't need to test 'failed' directly.
+ */
+static void handle_completed_write_requests(raid5_conf_t *conf,
+	struct stripe_head *sh, int disks, struct bio **return_bi)
+{
+	int i;
+	struct r5dev *dev;
+
+	for (i = disks; i--; )
+		if (sh->dev[i].written) {
+			dev = &sh->dev[i];
+			if (!test_bit(R5_LOCKED, &dev->flags) &&
+				test_bit(R5_UPTODATE, &dev->flags)) {
+				/* We can return any write requests */
+				struct bio *wbi, *wbi2;
+				int bitmap_end = 0;
+				PRINTK("Return write for disc %d\n", i);
+				spin_lock_irq(&conf->device_lock);
+				wbi = dev->written;
+				dev->written = NULL;
+				while (wbi && wbi->bi_sector <
+					dev->sector + STRIPE_SECTORS) {
+					wbi2 = r5_next_bio(wbi, dev->sector);
+					if (--wbi->bi_phys_segments == 0) {
+						md_write_end(conf->mddev);
+						wbi->bi_next = *return_bi;
+						*return_bi = wbi;
+					}
+					wbi = wbi2;
+				}
+				if (dev->towrite == NULL)
+					bitmap_end = 1;
+				spin_unlock_irq(&conf->device_lock);
+				if (bitmap_end)
+					bitmap_endwrite(conf->mddev->bitmap,
+							sh->sector,
+							STRIPE_SECTORS,
+					 !test_bit(STRIPE_DEGRADED, &sh->state),
+							0);
+			}
+		}
+}
+
+static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
+		struct stripe_head *sh,	struct stripe_head_state *s, int disks)
+{
+	int rmw = 0, rcw = 0, i;
+	for (i = disks; i--; ) {
+		/* would I have to read this buffer for read_modify_write */
+		struct r5dev *dev = &sh->dev[i];
+		if ((dev->towrite || i == sh->pd_idx) &&
+		    !test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags)) {
+			if (test_bit(R5_Insync, &dev->flags))
+				rmw++;
+			else
+				rmw += 2*disks;  /* cannot read it */
+		}
+		/* Would I have to read this buffer for reconstruct_write */
+		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+		    !test_bit(R5_LOCKED, &dev->flags) &&
+		    !test_bit(R5_UPTODATE, &dev->flags)) {
+			if (test_bit(R5_Insync, &dev->flags))
+				rcw++;
+			else
+				rcw += 2*disks;
+		}
+	}
+	PRINTK("for sector %llu, rmw=%d rcw=%d\n",
+		(unsigned long long)sh->sector, rmw, rcw);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	if (rmw < rcw && rmw > 0)
+		/* prefer read-modify-write, but need to get some data */
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if ((dev->towrite || i == sh->pd_idx) &&
+			    !test_bit(R5_LOCKED, &dev->flags) &&
+			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    test_bit(R5_Insync, &dev->flags)) {
+				if (
+				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					PRINTK("Read_old block "
+						"%d for r-m-w\n", i);
+					set_bit(R5_LOCKED, &dev->flags);
+					set_bit(R5_Wantread, &dev->flags);
+					s->locked++;
+				} else {
+					set_bit(STRIPE_DELAYED, &sh->state);
+					set_bit(STRIPE_HANDLE, &sh->state);
+				}
+			}
+		}
+	if (rcw <= rmw && rcw > 0)
+		/* want reconstruct write, but need to get some data */
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+			    i != sh->pd_idx &&
+			    !test_bit(R5_LOCKED, &dev->flags) &&
+			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    test_bit(R5_Insync, &dev->flags)) {
+				if (
+				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					PRINTK("Read_old block "
+						"%d for Reconstruct\n", i);
+					set_bit(R5_LOCKED, &dev->flags);
+					set_bit(R5_Wantread, &dev->flags);
+					s->locked++;
+				} else {
+					set_bit(STRIPE_DELAYED, &sh->state);
+					set_bit(STRIPE_HANDLE, &sh->state);
+				}
+			}
+		}
+	/* now if nothing is locked, and if we have enough data,
+	 * we can start a write request
+	 */
+	if (s->locked == 0 && (rcw == 0 || rmw == 0) &&
+	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
+		PRINTK("Computing parity...\n");
+		compute_parity5(sh, rcw == 0 ?
+			RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+		/* now every locked buffer is ready to be written */
+		for (i = disks; i--; )
+			if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+				PRINTK("Writing block %d\n", i);
+				s->locked++;
+				set_bit(R5_Wantwrite, &sh->dev[i].flags);
+				if (!test_bit(R5_Insync, &sh->dev[i].flags)
+				    || (i == sh->pd_idx && s->failed == 0))
+					set_bit(STRIPE_INSYNC, &sh->state);
+			}
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+			atomic_dec(&conf->preread_active_stripes);
+			if (atomic_read(&conf->preread_active_stripes) <
+			    IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+	}
+}
+
+static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
+		struct stripe_head *sh,	struct stripe_head_state *s,
+		struct r6_state *r6s, int disks)
+{
+	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
+	int qd_idx = r6s->qd_idx;
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		/* Would I have to read this buffer for reconstruct_write */
+		if (!test_bit(R5_OVERWRITE, &dev->flags)
+		    && i != pd_idx && i != qd_idx
+		    && (!test_bit(R5_LOCKED, &dev->flags)
+			    ) &&
+		    !test_bit(R5_UPTODATE, &dev->flags)) {
+			if (test_bit(R5_Insync, &dev->flags)) rcw++;
+			else {
+				PRINTK("raid6: must_compute: "
+					"disk %d flags=%#lx\n", i, dev->flags);
+				must_compute++;
+			}
+		}
+	}
+	PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
+	       (unsigned long long)sh->sector, rcw, must_compute);
+	set_bit(STRIPE_HANDLE, &sh->state);
+
+	if (rcw > 0)
+		/* want reconstruct write, but need to get some data */
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (!test_bit(R5_OVERWRITE, &dev->flags)
+			    && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
+			    && !test_bit(R5_LOCKED, &dev->flags) &&
+			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    test_bit(R5_Insync, &dev->flags)) {
+				if (
+				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+					PRINTK("Read_old stripe %llu "
+						"block %d for Reconstruct\n",
+					     (unsigned long long)sh->sector, i);
+					set_bit(R5_LOCKED, &dev->flags);
+					set_bit(R5_Wantread, &dev->flags);
+					s->locked++;
+				} else {
+					PRINTK("Request delayed stripe %llu "
+						"block %d for Reconstruct\n",
+					     (unsigned long long)sh->sector, i);
+					set_bit(STRIPE_DELAYED, &sh->state);
+					set_bit(STRIPE_HANDLE, &sh->state);
+				}
+			}
+		}
+	/* now if nothing is locked, and if we have enough data, we can start a
+	 * write request
+	 */
+	if (s->locked == 0 && rcw == 0 &&
+	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
+		if (must_compute > 0) {
+			/* We have failed blocks and need to compute them */
+			switch (s->failed) {
+			case 0:
+				BUG();
+			case 1:
+				compute_block_1(sh, r6s->failed_num[0], 0);
+				break;
+			case 2:
+				compute_block_2(sh, r6s->failed_num[0],
+						r6s->failed_num[1]);
+				break;
+			default: /* This request should have been failed? */
+				BUG();
+			}
+		}
+
+		PRINTK("Computing parity for stripe %llu\n",
+			(unsigned long long)sh->sector);
+		compute_parity6(sh, RECONSTRUCT_WRITE);
+		/* now every locked buffer is ready to be written */
+		for (i = disks; i--; )
+			if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+				PRINTK("Writing stripe %llu block %d\n",
+				       (unsigned long long)sh->sector, i);
+				s->locked++;
+				set_bit(R5_Wantwrite, &sh->dev[i].flags);
+			}
+		/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
+		set_bit(STRIPE_INSYNC, &sh->state);
+
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+			atomic_dec(&conf->preread_active_stripes);
+			if (atomic_read(&conf->preread_active_stripes) <
+			    IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		}
+	}
+}
+
+static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
+				struct stripe_head_state *s, int disks)
+{
+	set_bit(STRIPE_HANDLE, &sh->state);
+	if (s->failed == 0) {
+		BUG_ON(s->uptodate != disks);
+		compute_parity5(sh, CHECK_PARITY);
+		s->uptodate--;
+		if (page_is_zero(sh->dev[sh->pd_idx].page)) {
+			/* parity is correct (on disc, not in buffer any more)
+			 */
+			set_bit(STRIPE_INSYNC, &sh->state);
+		} else {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				compute_block(sh, sh->pd_idx);
+				s->uptodate++;
+			}
+		}
+	}
+	if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+		struct r5dev *dev;
+		/* either failed parity check, or recovery is happening */
+		if (s->failed == 0)
+			s->failed_num = sh->pd_idx;
+		dev = &sh->dev[s->failed_num];
+		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
+		BUG_ON(s->uptodate != disks);
+
+		set_bit(R5_LOCKED, &dev->flags);
+		set_bit(R5_Wantwrite, &dev->flags);
+		clear_bit(STRIPE_DEGRADED, &sh->state);
+		s->locked++;
+		set_bit(STRIPE_INSYNC, &sh->state);
+	}
+}
+
+
+static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
+				struct stripe_head_state *s,
+				struct r6_state *r6s, struct page *tmp_page,
+				int disks)
+{
+	int update_p = 0, update_q = 0;
+	struct r5dev *dev;
+	int pd_idx = sh->pd_idx;
+	int qd_idx = r6s->qd_idx;
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+
+	BUG_ON(s->failed > 2);
+	BUG_ON(s->uptodate < disks);
+	/* Want to check and possibly repair P and Q.
+	 * However there could be one 'failed' device, in which
+	 * case we can only check one of them, possibly using the
+	 * other to generate missing data
+	 */
+
+	/* If !tmp_page, we cannot do the calculations,
+	 * but as we have set STRIPE_HANDLE, we will soon be called
+	 * by stripe_handle with a tmp_page - just wait until then.
+	 */
+	if (tmp_page) {
+		if (s->failed == r6s->q_failed) {
+			/* The only possible failed device holds 'Q', so it
+			 * makes sense to check P (If anything else were failed,
+			 * we would have used P to recreate it).
+			 */
+			compute_block_1(sh, pd_idx, 1);
+			if (!page_is_zero(sh->dev[pd_idx].page)) {
+				compute_block_1(sh, pd_idx, 0);
+				update_p = 1;
+			}
+		}
+		if (!r6s->q_failed && s->failed < 2) {
+			/* q is not failed, and we didn't use it to generate
+			 * anything, so it makes sense to check it
+			 */
+			memcpy(page_address(tmp_page),
+			       page_address(sh->dev[qd_idx].page),
+			       STRIPE_SIZE);
+			compute_parity6(sh, UPDATE_PARITY);
+			if (memcmp(page_address(tmp_page),
+				   page_address(sh->dev[qd_idx].page),
+				   STRIPE_SIZE) != 0) {
+				clear_bit(STRIPE_INSYNC, &sh->state);
+				update_q = 1;
+			}
+		}
+		if (update_p || update_q) {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				update_p = update_q = 0;
+		}
+
+		/* now write out any block on a failed drive,
+		 * or P or Q if they need it
+		 */
+
+		if (s->failed == 2) {
+			dev = &sh->dev[r6s->failed_num[1]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (s->failed >= 1) {
+			dev = &sh->dev[r6s->failed_num[0]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+
+		if (update_p) {
+			dev = &sh->dev[pd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (update_q) {
+			dev = &sh->dev[qd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		clear_bit(STRIPE_DEGRADED, &sh->state);
+
+		set_bit(STRIPE_INSYNC, &sh->state);
+	}
+}
+
+static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
+				struct r6_state *r6s)
+{
+	int i;
+
+	/* We have read all the blocks in this stripe and now we need to
+	 * copy some of them into a target stripe for expand.
+	 */
+	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	for (i = 0; i < sh->disks; i++)
+		if (i != sh->pd_idx && (r6s && i != r6s->qd_idx)) {
+			int dd_idx, pd_idx, j;
+			struct stripe_head *sh2;
+
+			sector_t bn = compute_blocknr(sh, i);
+			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
+						conf->raid_disks -
+						conf->max_degraded, &dd_idx,
+						&pd_idx, conf);
+			sh2 = get_active_stripe(conf, s, conf->raid_disks,
+						pd_idx, 1);
+			if (sh2 == NULL)
+				/* so far only the early blocks of this stripe
+				 * have been requested.  When later blocks
+				 * get requested, we will try again
+				 */
+				continue;
+			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
+				/* must have already done this block */
+				release_stripe(sh2);
+				continue;
+			}
+			memcpy(page_address(sh2->dev[dd_idx].page),
+			       page_address(sh->dev[i].page),
+			       STRIPE_SIZE);
+			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
+			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+			for (j = 0; j < conf->raid_disks; j++)
+				if (j != sh2->pd_idx &&
+				    (r6s && j != r6s->qd_idx) &&
+				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
+					break;
+			if (j == conf->raid_disks) {
+				set_bit(STRIPE_EXPAND_READY, &sh2->state);
+				set_bit(STRIPE_HANDLE, &sh2->state);
+			}
+			release_stripe(sh2);
+		}
+}
 
 /*
  * handle_stripe - do things to a stripe.
@@ -1344,20 +1963,16 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
  * get BH_Lock set before the stripe lock is released.
  *
  */
- 
+
 static void handle_stripe5(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int disks = sh->disks;
-	struct bio *return_bi= NULL;
-	struct bio *bi;
-	int i;
-	int syncing, expanding, expanded;
-	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-	int non_overwrite = 0;
-	int failed_num=0;
+	int disks = sh->disks, i;
+	struct bio *return_bi = NULL;
+	struct stripe_head_state s;
 	struct r5dev *dev;
 
+	memset(&s, 0, sizeof(s));
 	PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
 		(unsigned long long)sh->sector, atomic_read(&sh->count),
 		sh->pd_idx);
@@ -1366,15 +1981,15 @@ static void handle_stripe5(struct stripe_head *sh)
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
-	syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
-		dev = &sh->dev[i];
+		struct r5dev *dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
 
 		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
@@ -1403,17 +2018,18 @@ static void handle_stripe5(struct stripe_head *sh)
 		}
 
 		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
 
-		
-		if (dev->toread) to_read++;
+		if (dev->toread)
+			s.to_read++;
 		if (dev->towrite) {
-			to_write++;
+			s.to_write++;
 			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				non_overwrite++;
+				s.non_overwrite++;
 		}
-		if (dev->written) written++;
+		if (dev->written)
+			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
@@ -1422,306 +2038,59 @@ static void handle_stripe5(struct stripe_head *sh)
 		}
 		if (!rdev || !test_bit(In_sync, &rdev->flags)
 		    || test_bit(R5_ReadError, &dev->flags)) {
-			failed++;
-			failed_num = i;
+			s.failed++;
+			s.failed_num = i;
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
 	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
-		locked, uptodate, to_read, to_write, failed, failed_num);
+		s.locked, s.uptodate, s.to_read, s.to_write,
+		s.failed, s.failed_num);
 	/* check if the array has lost two devices and, if so, some requests might
 	 * need to be failed
 	 */
-	if (failed > 1 && to_read+to_write+written) {
-		for (i=disks; i--; ) {
-			int bitmap_end = 0;
-
-			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev;
-				rcu_read_lock();
-				rdev = rcu_dereference(conf->disks[i].rdev);
-				if (rdev && test_bit(In_sync, &rdev->flags))
-					/* multiple read failures in one stripe */
-					md_error(conf->mddev, rdev);
-				rcu_read_unlock();
-			}
-
-			spin_lock_irq(&conf->device_lock);
-			/* fail all writes first */
-			bi = sh->dev[i].towrite;
-			sh->dev[i].towrite = NULL;
-			if (bi) { to_write--; bitmap_end = 1; }
-
-			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-				wake_up(&conf->wait_for_overlap);
-
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = nextbi;
-			}
-			/* and fail all 'written' */
-			bi = sh->dev[i].written;
-			sh->dev[i].written = NULL;
-			if (bi) bitmap_end = 1;
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
-				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = bi2;
-			}
-
-			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
-			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				bi = sh->dev[i].toread;
-				sh->dev[i].toread = NULL;
-				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-					wake_up(&conf->wait_for_overlap);
-				if (bi) to_read--;
-				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-					clear_bit(BIO_UPTODATE, &bi->bi_flags);
-					if (--bi->bi_phys_segments == 0) {
-						bi->bi_next = return_bi;
-						return_bi = bi;
-					}
-					bi = nextbi;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-			if (bitmap_end)
-				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						STRIPE_SECTORS, 0, 0);
-		}
-	}
-	if (failed > 1 && syncing) {
+	if (s.failed > 1 && s.to_read+s.to_write+s.written)
+		handle_requests_to_failed_array(conf, sh, &s, disks,
+						&return_bi);
+	if (s.failed > 1 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
-		syncing = 0;
+		s.syncing = 0;
 	}
 
 	/* might be able to return some write requests if the parity block
 	 * is safe, or on a failed drive
 	 */
 	dev = &sh->dev[sh->pd_idx];
-	if ( written &&
-	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
-		test_bit(R5_UPTODATE, &dev->flags))
-	       || (failed == 1 && failed_num == sh->pd_idx))
-	    ) {
-	    /* any written block on an uptodate or failed drive can be returned.
-	     * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 
-	     * never LOCKED, so we don't need to test 'failed' directly.
-	     */
-	    for (i=disks; i--; )
-		if (sh->dev[i].written) {
-		    dev = &sh->dev[i];
-		    if (!test_bit(R5_LOCKED, &dev->flags) &&
-			 test_bit(R5_UPTODATE, &dev->flags) ) {
-			/* We can return any write requests */
-			    struct bio *wbi, *wbi2;
-			    int bitmap_end = 0;
-			    PRINTK("Return write for disc %d\n", i);
-			    spin_lock_irq(&conf->device_lock);
-			    wbi = dev->written;
-			    dev->written = NULL;
-			    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-				    wbi2 = r5_next_bio(wbi, dev->sector);
-				    if (--wbi->bi_phys_segments == 0) {
-					    md_write_end(conf->mddev);
-					    wbi->bi_next = return_bi;
-					    return_bi = wbi;
-				    }
-				    wbi = wbi2;
-			    }
-			    if (dev->towrite == NULL)
-				    bitmap_end = 1;
-			    spin_unlock_irq(&conf->device_lock);
-			    if (bitmap_end)
-				    bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						    STRIPE_SECTORS,
-						    !test_bit(STRIPE_DEGRADED, &sh->state), 0);
-		    }
-		}
-	}
+	if ( s.written &&
+	     ((test_bit(R5_Insync, &dev->flags) &&
+	       !test_bit(R5_LOCKED, &dev->flags) &&
+	       test_bit(R5_UPTODATE, &dev->flags)) ||
+	       (s.failed == 1 && s.failed_num == sh->pd_idx)))
+		handle_completed_write_requests(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
-		for (i=disks; i--;) {
-			dev = &sh->dev[i];
-			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-			    (dev->toread ||
-			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-			     syncing ||
-			     expanding ||
-			     (failed && (sh->dev[failed_num].toread ||
-					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
-				    )
-				) {
-				/* we would like to get this block, possibly
-				 * by computing it, but we might not be able to
-				 */
-				if (uptodate == disks-1) {
-					PRINTK("Computing block %d\n", i);
-					compute_block(sh, i);
-					uptodate++;
-				} else if (test_bit(R5_Insync, &dev->flags)) {
-					set_bit(R5_LOCKED, &dev->flags);
-					set_bit(R5_Wantread, &dev->flags);
-					locked++;
-					PRINTK("Reading block %d (sync=%d)\n", 
-						i, syncing);
-				}
-			}
-		}
-		set_bit(STRIPE_HANDLE, &sh->state);
-	}
+	if (s.to_read || s.non_overwrite ||
+		(s.syncing && (s.uptodate < disks)) || s.expanding)
+		handle_issuing_new_read_requests5(sh, &s, disks);
 
 	/* now to consider writing and what else, if anything should be read */
-	if (to_write) {
-		int rmw=0, rcw=0;
-		for (i=disks ; i--;) {
-			/* would I have to read this buffer for read_modify_write */
-			dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
-			    (!test_bit(R5_LOCKED, &dev->flags) 
-				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
-				if (test_bit(R5_Insync, &dev->flags)
-/*				    && !(!mddev->insync && i == sh->pd_idx) */
-					)
-					rmw++;
-				else rmw += 2*disks;  /* cannot read it */
-			}
-			/* Would I have to read this buffer for reconstruct_write */
-			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-			    (!test_bit(R5_LOCKED, &dev->flags) 
-				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
-				if (test_bit(R5_Insync, &dev->flags)) rcw++;
-				else rcw += 2*disks;
-			}
-		}
-		PRINTK("for sector %llu, rmw=%d rcw=%d\n", 
-			(unsigned long long)sh->sector, rmw, rcw);
-		set_bit(STRIPE_HANDLE, &sh->state);
-		if (rmw < rcw && rmw > 0)
-			/* prefer read-modify-write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if ((dev->towrite || i == sh->pd_idx) &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old block %d for r-m-w\n", i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						locked++;
-					} else {
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		if (rcw <= rmw && rcw > 0)
-			/* want reconstruct write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
-				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old block %d for Reconstruct\n", i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						locked++;
-					} else {
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		if (locked == 0 && (rcw == 0 ||rmw == 0) &&
-		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-			PRINTK("Computing parity...\n");
-			compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
-			/* now every locked buffer is ready to be written */
-			for (i=disks; i--;)
-				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-					PRINTK("Writing block %d\n", i);
-					locked++;
-					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-					if (!test_bit(R5_Insync, &sh->dev[i].flags)
-					    || (i==sh->pd_idx && failed == 0))
-						set_bit(STRIPE_INSYNC, &sh->state);
-				}
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-		}
-	}
+	if (s.to_write)
+		handle_issuing_new_write_requests5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough data
 	 * is available
 	 */
-	if (syncing && locked == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state)) {
-		set_bit(STRIPE_HANDLE, &sh->state);
-		if (failed == 0) {
-			BUG_ON(uptodate != disks);
-			compute_parity5(sh, CHECK_PARITY);
-			uptodate--;
-			if (page_is_zero(sh->dev[sh->pd_idx].page)) {
-				/* parity is correct (on disc, not in buffer any more) */
-				set_bit(STRIPE_INSYNC, &sh->state);
-			} else {
-				conf->mddev->resync_mismatches += STRIPE_SECTORS;
-				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-					/* don't try to repair!! */
-					set_bit(STRIPE_INSYNC, &sh->state);
-				else {
-					compute_block(sh, sh->pd_idx);
-					uptodate++;
-				}
-			}
-		}
-		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
-			/* either failed parity check, or recovery is happening */
-			if (failed==0)
-				failed_num = sh->pd_idx;
-			dev = &sh->dev[failed_num];
-			BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
-			BUG_ON(uptodate != disks);
-
-			set_bit(R5_LOCKED, &dev->flags);
-			set_bit(R5_Wantwrite, &dev->flags);
-			clear_bit(STRIPE_DEGRADED, &sh->state);
-			locked++;
-			set_bit(STRIPE_INSYNC, &sh->state);
-		}
-	}
-	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+	if (s.syncing && s.locked == 0 &&
+	    !test_bit(STRIPE_INSYNC, &sh->state))
+		handle_parity_checks5(conf, sh, &s, disks);
+	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
@@ -1729,99 +2098,50 @@ static void handle_stripe5(struct stripe_head *sh)
 	/* If the failed drive is just a ReadError, then we might need to progress
 	 * the repair/check process
 	 */
-	if (failed == 1 && ! conf->mddev->ro &&
-	    test_bit(R5_ReadError, &sh->dev[failed_num].flags)
-	    && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
-	    && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
+	if (s.failed == 1 && !conf->mddev->ro &&
+	    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
+	    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
+	    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
 		) {
-		dev = &sh->dev[failed_num];
+		dev = &sh->dev[s.failed_num];
 		if (!test_bit(R5_ReWrite, &dev->flags)) {
 			set_bit(R5_Wantwrite, &dev->flags);
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
-			locked++;
+			s.locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
-			locked++;
+			s.locked++;
 		}
 	}
 
-	if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
 		compute_parity5(sh, RECONSTRUCT_WRITE);
-		for (i= conf->raid_disks; i--;) {
+		for (i = conf->raid_disks; i--; ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			locked++;
+			s.locked++;
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 		}
 		clear_bit(STRIPE_EXPANDING, &sh->state);
-	} else if (expanded) {
+	} else if (s.expanded) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 	}
 
-	if (expanding && locked == 0) {
-		/* We have read all the blocks in this stripe and now we need to
-		 * copy some of them into a target stripe for expand.
-		 */
-		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-		for (i=0; i< sh->disks; i++)
-			if (i != sh->pd_idx) {
-				int dd_idx, pd_idx, j;
-				struct stripe_head *sh2;
-
-				sector_t bn = compute_blocknr(sh, i);
-				sector_t s = raid5_compute_sector(bn, conf->raid_disks,
-								  conf->raid_disks-1,
-								  &dd_idx, &pd_idx, conf);
-				sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
-				if (sh2 == NULL)
-					/* so far only the early blocks of this stripe
-					 * have been requested.  When later blocks
-					 * get requested, we will try again
-					 */
-					continue;
-				if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
-				   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
-					/* must have already done this block */
-					release_stripe(sh2);
-					continue;
-				}
-				memcpy(page_address(sh2->dev[dd_idx].page),
-				       page_address(sh->dev[i].page),
-				       STRIPE_SIZE);
-				set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
-				set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
-				for (j=0; j<conf->raid_disks; j++)
-					if (j != sh2->pd_idx &&
-					    !test_bit(R5_Expanded, &sh2->dev[j].flags))
-						break;
-				if (j == conf->raid_disks) {
-					set_bit(STRIPE_EXPAND_READY, &sh2->state);
-					set_bit(STRIPE_HANDLE, &sh2->state);
-				}
-				release_stripe(sh2);
-			}
-	}
+	if (s.expanding && s.locked == 0)
+		handle_stripe_expansion(conf, sh, NULL);
 
 	spin_unlock(&sh->lock);
 
-	while ((bi=return_bi)) {
-		int bytes = bi->bi_size;
+	return_io(return_bi);
 
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
-		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-			        ? 0 : -EIO);
-	}
 	for (i=disks; i-- ;) {
 		int rw;
 		struct bio *bi;
@@ -1850,7 +2170,7 @@ static void handle_stripe5(struct stripe_head *sh)
 		rcu_read_unlock();
  
 		if (rdev) {
-			if (syncing || expanding || expanded)
+			if (s.syncing || s.expanding || s.expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
@@ -1886,29 +2206,26 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
-	struct bio *return_bi= NULL;
-	struct bio *bi;
-	int i;
-	int syncing, expanding, expanded;
-	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-	int non_overwrite = 0;
-	int failed_num[2] = {0, 0};
+	struct bio *return_bi = NULL;
+	int i, pd_idx = sh->pd_idx;
+	struct stripe_head_state s;
+	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-	int p_failed, q_failed;
 
-	PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
-	       (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
-	       pd_idx, qd_idx);
+	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
+	PRINTK("handling stripe %llu, state=%#lx cnt=%d, "
+		"pd_idx=%d, qd_idx=%d\n",
+	       (unsigned long long)sh->sector, sh->state,
+	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
+	memset(&s, 0, sizeof(s));
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
-	syncing = test_bit(STRIPE_SYNCING, &sh->state);
-	expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
@@ -1943,17 +2260,19 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		}
 
 		/* now count some things */
-		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
-		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
+		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
 
 
-		if (dev->toread) to_read++;
+		if (dev->toread)
+			s.to_read++;
 		if (dev->towrite) {
-			to_write++;
+			s.to_write++;
 			if (!test_bit(R5_OVERWRITE, &dev->flags))
-				non_overwrite++;
+				s.non_overwrite++;
 		}
-		if (dev->written) written++;
+		if (dev->written)
+			s.written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
@@ -1962,96 +2281,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		}
 		if (!rdev || !test_bit(In_sync, &rdev->flags)
 		    || test_bit(R5_ReadError, &dev->flags)) {
-			if ( failed < 2 )
-				failed_num[failed] = i;
-			failed++;
+			if (s.failed < 2)
+				r6s.failed_num[s.failed] = i;
+			s.failed++;
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
 	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
-	       locked, uptodate, to_read, to_write, failed,
-	       failed_num[0], failed_num[1]);
-	/* check if the array has lost >2 devices and, if so, some requests might
-	 * need to be failed
+	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
+	       r6s.failed_num[0], r6s.failed_num[1]);
+	/* check if the array has lost >2 devices and, if so, some requests
+	 * might need to be failed
 	 */
-	if (failed > 2 && to_read+to_write+written) {
-		for (i=disks; i--; ) {
-			int bitmap_end = 0;
-
-			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev;
-				rcu_read_lock();
-				rdev = rcu_dereference(conf->disks[i].rdev);
-				if (rdev && test_bit(In_sync, &rdev->flags))
-					/* multiple read failures in one stripe */
-					md_error(conf->mddev, rdev);
-				rcu_read_unlock();
-			}
-
-			spin_lock_irq(&conf->device_lock);
-			/* fail all writes first */
-			bi = sh->dev[i].towrite;
-			sh->dev[i].towrite = NULL;
-			if (bi) { to_write--; bitmap_end = 1; }
-
-			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-				wake_up(&conf->wait_for_overlap);
-
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = nextbi;
-			}
-			/* and fail all 'written' */
-			bi = sh->dev[i].written;
-			sh->dev[i].written = NULL;
-			if (bi) bitmap_end = 1;
-			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
-				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
-					md_write_end(conf->mddev);
-					bi->bi_next = return_bi;
-					return_bi = bi;
-				}
-				bi = bi2;
-			}
-
-			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
-			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				bi = sh->dev[i].toread;
-				sh->dev[i].toread = NULL;
-				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-					wake_up(&conf->wait_for_overlap);
-				if (bi) to_read--;
-				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
-					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-					clear_bit(BIO_UPTODATE, &bi->bi_flags);
-					if (--bi->bi_phys_segments == 0) {
-						bi->bi_next = return_bi;
-						return_bi = bi;
-					}
-					bi = nextbi;
-				}
-			}
-			spin_unlock_irq(&conf->device_lock);
-			if (bitmap_end)
-				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						STRIPE_SECTORS, 0, 0);
-		}
-	}
-	if (failed > 2 && syncing) {
+	if (s.failed > 2 && s.to_read+s.to_write+s.written)
+		handle_requests_to_failed_array(conf, sh, &s, disks,
+						&return_bi);
+	if (s.failed > 2 && s.syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
 		clear_bit(STRIPE_SYNCING, &sh->state);
-		syncing = 0;
+		s.syncing = 0;
 	}
 
 	/*
@@ -2059,279 +2309,41 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	 * are safe, or on a failed drive
 	 */
 	pdev = &sh->dev[pd_idx];
-	p_failed = (failed >= 1 && failed_num[0] == pd_idx)
-		|| (failed >= 2 && failed_num[1] == pd_idx);
-	qdev = &sh->dev[qd_idx];
-	q_failed = (failed >= 1 && failed_num[0] == qd_idx)
-		|| (failed >= 2 && failed_num[1] == qd_idx);
-
-	if ( written &&
-	     ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
+	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
+		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
+	qdev = &sh->dev[r6s.qd_idx];
+	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
+		|| (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
+
+	if ( s.written &&
+	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
 			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
-	     ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+	     ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
-		/* any written block on an uptodate or failed drive can be
-		 * returned.  Note that if we 'wrote' to a failed drive,
-		 * it will be UPTODATE, but never LOCKED, so we don't need
-		 * to test 'failed' directly.
-		 */
-		for (i=disks; i--; )
-			if (sh->dev[i].written) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_LOCKED, &dev->flags) &&
-				    test_bit(R5_UPTODATE, &dev->flags) ) {
-					/* We can return any write requests */
-					int bitmap_end = 0;
-					struct bio *wbi, *wbi2;
-					PRINTK("Return write for stripe %llu disc %d\n",
-					       (unsigned long long)sh->sector, i);
-					spin_lock_irq(&conf->device_lock);
-					wbi = dev->written;
-					dev->written = NULL;
-					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-						wbi2 = r5_next_bio(wbi, dev->sector);
-						if (--wbi->bi_phys_segments == 0) {
-							md_write_end(conf->mddev);
-							wbi->bi_next = return_bi;
-							return_bi = wbi;
-						}
-						wbi = wbi2;
-					}
-					if (dev->towrite == NULL)
-						bitmap_end = 1;
-					spin_unlock_irq(&conf->device_lock);
-					if (bitmap_end)
-						bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-								STRIPE_SECTORS,
-								!test_bit(STRIPE_DEGRADED, &sh->state), 0);
-				}
-			}
-	}
+			     && test_bit(R5_UPTODATE, &qdev->flags)))))
+		handle_completed_write_requests(conf, sh, disks, &return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (to_write && failed) ||
-	    (syncing && (uptodate < disks)) || expanding) {
-		for (i=disks; i--;) {
-			dev = &sh->dev[i];
-			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-			    (dev->toread ||
-			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-			     syncing ||
-			     expanding ||
-			     (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
-			     (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
-				    )
-				) {
-				/* we would like to get this block, possibly
-				 * by computing it, but we might not be able to
-				 */
-				if (uptodate == disks-1) {
-					PRINTK("Computing stripe %llu block %d\n",
-					       (unsigned long long)sh->sector, i);
-					compute_block_1(sh, i, 0);
-					uptodate++;
-				} else if ( uptodate == disks-2 && failed >= 2 ) {
-					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
-					int other;
-					for (other=disks; other--;) {
-						if ( other == i )
-							continue;
-						if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
-							break;
-					}
-					BUG_ON(other < 0);
-					PRINTK("Computing stripe %llu blocks %d,%d\n",
-					       (unsigned long long)sh->sector, i, other);
-					compute_block_2(sh, i, other);
-					uptodate += 2;
-				} else if (test_bit(R5_Insync, &dev->flags)) {
-					set_bit(R5_LOCKED, &dev->flags);
-					set_bit(R5_Wantread, &dev->flags);
-					locked++;
-					PRINTK("Reading block %d (sync=%d)\n",
-						i, syncing);
-				}
-			}
-		}
-		set_bit(STRIPE_HANDLE, &sh->state);
-	}
+	if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
+	    (s.syncing && (s.uptodate < disks)) || s.expanding)
+		handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
 
 	/* now to consider writing and what else, if anything should be read */
-	if (to_write) {
-		int rcw=0, must_compute=0;
-		for (i=disks ; i--;) {
-			dev = &sh->dev[i];
-			/* Would I have to read this buffer for reconstruct_write */
-			if (!test_bit(R5_OVERWRITE, &dev->flags)
-			    && i != pd_idx && i != qd_idx
-			    && (!test_bit(R5_LOCKED, &dev->flags)
-				    ) &&
-			    !test_bit(R5_UPTODATE, &dev->flags)) {
-				if (test_bit(R5_Insync, &dev->flags)) rcw++;
-				else {
-					PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
-					must_compute++;
-				}
-			}
-		}
-		PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
-		       (unsigned long long)sh->sector, rcw, must_compute);
-		set_bit(STRIPE_HANDLE, &sh->state);
-
-		if (rcw > 0)
-			/* want reconstruct write, but need to get some data */
-			for (i=disks; i--;) {
-				dev = &sh->dev[i];
-				if (!test_bit(R5_OVERWRITE, &dev->flags)
-				    && !(failed == 0 && (i == pd_idx || i == qd_idx))
-				    && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-				    test_bit(R5_Insync, &dev->flags)) {
-					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-					{
-						PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
-						       (unsigned long long)sh->sector, i);
-						set_bit(R5_LOCKED, &dev->flags);
-						set_bit(R5_Wantread, &dev->flags);
-						locked++;
-					} else {
-						PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
-						       (unsigned long long)sh->sector, i);
-						set_bit(STRIPE_DELAYED, &sh->state);
-						set_bit(STRIPE_HANDLE, &sh->state);
-					}
-				}
-			}
-		/* now if nothing is locked, and if we have enough data, we can start a write request */
-		if (locked == 0 && rcw == 0 &&
-		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
-			if ( must_compute > 0 ) {
-				/* We have failed blocks and need to compute them */
-				switch ( failed ) {
-				case 0:	BUG();
-				case 1: compute_block_1(sh, failed_num[0], 0); break;
-				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
-				default: BUG();	/* This request should have been failed? */
-				}
-			}
-
-			PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
-			compute_parity6(sh, RECONSTRUCT_WRITE);
-			/* now every locked buffer is ready to be written */
-			for (i=disks; i--;)
-				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
-					PRINTK("Writing stripe %llu block %d\n",
-					       (unsigned long long)sh->sector, i);
-					locked++;
-					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-				}
-			/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
-			set_bit(STRIPE_INSYNC, &sh->state);
-
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-				atomic_dec(&conf->preread_active_stripes);
-				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			}
-		}
-	}
+	if (s.to_write)
+		handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
-	 * Any reads will already have been scheduled, so we just see if enough data
-	 * is available
+	 * Any reads will already have been scheduled, so we just see if enough
+	 * data is available
 	 */
-	if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
-		int update_p = 0, update_q = 0;
-		struct r5dev *dev;
-
-		set_bit(STRIPE_HANDLE, &sh->state);
+	if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
+		handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
 
-		BUG_ON(failed>2);
-		BUG_ON(uptodate < disks);
-		/* Want to check and possibly repair P and Q.
-		 * However there could be one 'failed' device, in which
-		 * case we can only check one of them, possibly using the
-		 * other to generate missing data
-		 */
-
-		/* If !tmp_page, we cannot do the calculations,
-		 * but as we have set STRIPE_HANDLE, we will soon be called
-		 * by stripe_handle with a tmp_page - just wait until then.
-		 */
-		if (tmp_page) {
-			if (failed == q_failed) {
-				/* The only possible failed device holds 'Q', so it makes
-				 * sense to check P (If anything else were failed, we would
-				 * have used P to recreate it).
-				 */
-				compute_block_1(sh, pd_idx, 1);
-				if (!page_is_zero(sh->dev[pd_idx].page)) {
-					compute_block_1(sh,pd_idx,0);
-					update_p = 1;
-				}
-			}
-			if (!q_failed && failed < 2) {
-				/* q is not failed, and we didn't use it to generate
-				 * anything, so it makes sense to check it
-				 */
-				memcpy(page_address(tmp_page),
-				       page_address(sh->dev[qd_idx].page),
-				       STRIPE_SIZE);
-				compute_parity6(sh, UPDATE_PARITY);
-				if (memcmp(page_address(tmp_page),
-					   page_address(sh->dev[qd_idx].page),
-					   STRIPE_SIZE)!= 0) {
-					clear_bit(STRIPE_INSYNC, &sh->state);
-					update_q = 1;
-				}
-			}
-			if (update_p || update_q) {
-				conf->mddev->resync_mismatches += STRIPE_SECTORS;
-				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-					/* don't try to repair!! */
-					update_p = update_q = 0;
-			}
-
-			/* now write out any block on a failed drive,
-			 * or P or Q if they need it
-			 */
-
-			if (failed == 2) {
-				dev = &sh->dev[failed_num[1]];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			if (failed >= 1) {
-				dev = &sh->dev[failed_num[0]];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-
-			if (update_p) {
-				dev = &sh->dev[pd_idx];
-				locked ++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			if (update_q) {
-				dev = &sh->dev[qd_idx];
-				locked++;
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantwrite, &dev->flags);
-			}
-			clear_bit(STRIPE_DEGRADED, &sh->state);
-
-			set_bit(STRIPE_INSYNC, &sh->state);
-		}
-	}
-
-	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
@@ -2339,9 +2351,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	/* If the failed drives are just a ReadError, then we might need
 	 * to progress the repair/check process
 	 */
-	if (failed <= 2 && ! conf->mddev->ro)
-		for (i=0; i<failed;i++) {
-			dev = &sh->dev[failed_num[i]];
+	if (s.failed <= 2 && !conf->mddev->ro)
+		for (i = 0; i < s.failed; i++) {
+			dev = &sh->dev[r6s.failed_num[i]];
 			if (test_bit(R5_ReadError, &dev->flags)
 			    && !test_bit(R5_LOCKED, &dev->flags)
 			    && test_bit(R5_UPTODATE, &dev->flags)
@@ -2358,7 +2370,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 			}
 		}
 
-	if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
 		/* Need to write out all blocks after computing P&Q */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
@@ -2366,82 +2378,24 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		compute_parity6(sh, RECONSTRUCT_WRITE);
 		for (i = conf->raid_disks ; i-- ;  ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			locked++;
+			s.locked++;
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 		}
 		clear_bit(STRIPE_EXPANDING, &sh->state);
-	} else if (expanded) {
+	} else if (s.expanded) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 	}
 
-	if (expanding && locked == 0) {
-		/* We have read all the blocks in this stripe and now we need to
-		 * copy some of them into a target stripe for expand.
-		 */
-		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-		for (i = 0; i < sh->disks ; i++)
-			if (i != pd_idx && i != qd_idx) {
-				int dd_idx2, pd_idx2, j;
-				struct stripe_head *sh2;
-
-				sector_t bn = compute_blocknr(sh, i);
-				sector_t s = raid5_compute_sector(
-					bn, conf->raid_disks,
-					conf->raid_disks - conf->max_degraded,
-					&dd_idx2, &pd_idx2, conf);
-				sh2 = get_active_stripe(conf, s,
-							conf->raid_disks,
-						       pd_idx2, 1);
-				if (sh2 == NULL)
-					/* so for only the early blocks of
-					 * this stripe have been requests.
-					 * When later blocks get requests, we
-					 * will try again
-					 */
-					continue;
-				if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
-				    test_bit(R5_Expanded,
-					     &sh2->dev[dd_idx2].flags)) {
-					/* must have already done this block */
-					release_stripe(sh2);
-					continue;
-				}
-				memcpy(page_address(sh2->dev[dd_idx2].page),
-				       page_address(sh->dev[i].page),
-				       STRIPE_SIZE);
-				set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
-				set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
-				for (j = 0 ; j < conf->raid_disks ; j++)
-					if (j != sh2->pd_idx &&
-					    j != raid6_next_disk(sh2->pd_idx,
-							   sh2->disks) &&
-					    !test_bit(R5_Expanded,
-						      &sh2->dev[j].flags))
-						break;
-				if (j == conf->raid_disks) {
-					set_bit(STRIPE_EXPAND_READY,
-						&sh2->state);
-					set_bit(STRIPE_HANDLE, &sh2->state);
-				}
-				release_stripe(sh2);
-			}
-	}
+	if (s.expanding && s.locked == 0)
+		handle_stripe_expansion(conf, sh, &r6s);
 
 	spin_unlock(&sh->lock);
 
-	while ((bi=return_bi)) {
-		int bytes = bi->bi_size;
+	return_io(return_bi);
 
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
-		bi->bi_size = 0;
-		bi->bi_end_io(bi, bytes,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-			        ? 0 : -EIO);
-	}
 	for (i=disks; i-- ;) {
 		int rw;
 		struct bio *bi;
@@ -2470,7 +2424,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 		rcu_read_unlock();
 
 		if (rdev) {
-			if (syncing || expanding || expanded)
+			if (s.syncing || s.expanding || s.expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index d8286db60b96..b99d354f6128 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -145,6 +145,22 @@ struct stripe_head {
 		unsigned long	flags;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };
+
+/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
+ *     for handle_stripe.  It is only valid under spin_lock(sh->lock);
+ */
+struct stripe_head_state {
+	int syncing, expanding, expanded;
+	int locked, uptodate, to_read, to_write, failed, written;
+	int non_overwrite;
+	int failed_num;
+};
+
+/* r6_state - extra state data only relevant to r6 */
+struct r6_state {
+	int p_failed, q_failed, qd_idx, failed_num[2];
+};
+
 /* Flags */
 #define	R5_UPTODATE	0	/* page contains current data */
 #define	R5_LOCKED	1	/* IO has been submitted on "req" */
-- 
cgit v1.3-8-gc7d7


From 91c00924846a0034020451c280c76baa4299f9dc Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 13:52:30 -0700
Subject: md: raid5_run_ops - run stripe operations outside sh->lock

When the raid acceleration work was proposed, Neil laid out the following
attack plan:

1/ move the xor and copy operations outside spin_lock(&sh->lock)
2/ find/implement an asynchronous offload api

The raid5_run_ops routine uses the asynchronous offload api (async_tx) and
the stripe_operations member of a stripe_head to carry out xor+copy
operations asynchronously, outside the lock.

To perform operations outside the lock a new set of state flags is needed
to track new requests, in-flight requests, and completed requests.  In this
new model handle_stripe is tasked with scanning the stripe_head for work,
updating the stripe_operations structure, and finally dropping the lock and
calling raid5_run_ops for processing.  The following flags outline the
requests that handle_stripe can make of raid5_run_ops:

STRIPE_OP_BIOFILL
 - copy data into request buffers to satisfy a read request
STRIPE_OP_COMPUTE_BLK
 - generate a missing block in the cache from the other blocks
STRIPE_OP_PREXOR
 - subtract existing data as part of the read-modify-write process
STRIPE_OP_BIODRAIN
 - copy data out of request buffers to satisfy a write request
STRIPE_OP_POSTXOR
 - recalculate parity for new data that has entered the cache
STRIPE_OP_CHECK
 - verify that the parity is correct
STRIPE_OP_IO
 - submit i/o to the member disks (note this was already performed outside
   the stripe lock, but it made sense to add it as an operation type

The flow is:
1/ handle_stripe sets STRIPE_OP_* in sh->ops.pending
2/ raid5_run_ops reads sh->ops.pending, sets sh->ops.ack, and submits the
   operation to the async_tx api
3/ async_tx triggers the completion callback routine to set
   sh->ops.complete and release the stripe
4/ handle_stripe runs again to finish the operation and optionally submit
   new operations that were previously blocked

Note this patch just defines raid5_run_ops, subsequent commits (one per
major operation type) modify handle_stripe to take advantage of this
routine.

Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and
  ops_complete_write.
* removed the raid5_run_ops workqueue
* call bi_end_io for reads in ops_complete_biofill, saves a call to
  handle_stripe
* explicitly handle the 2-disk raid5 case (xor becomes memcpy), Neil Brown
* fix race between async engines and bi_end_io call for reads, Neil Brown
* remove unnecessary spin_lock from ops_complete_biofill
* remove test_and_set/test_and_clear BUG_ONs, Neil Brown
* remove explicit interrupt handling for channel switching, this feature
  was absorbed (i.e. it is now implicit) by the async_tx api
* use return_io in ops_complete_biofill

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 536 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/raid/raid5.h |  81 ++++++-
 2 files changed, 614 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e372e57687ee..0b7002479655 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
 #include "raid6.h"
 
 #include <linux/raid/bitmap.h>
+#include <linux/async_tx.h>
 
 /*
  * Stripe cache
@@ -341,6 +342,541 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 	return sh;
 }
 
+static int
+raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error);
+static int
+raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
+
+static void ops_run_io(struct stripe_head *sh)
+{
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, disks = sh->disks;
+
+	might_sleep();
+
+	for (i = disks; i--; ) {
+		int rw;
+		struct bio *bi;
+		mdk_rdev_t *rdev;
+		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
+			rw = WRITE;
+		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+			rw = READ;
+		else
+			continue;
+
+		bi = &sh->dev[i].req;
+
+		bi->bi_rw = rw;
+		if (rw == WRITE)
+			bi->bi_end_io = raid5_end_write_request;
+		else
+			bi->bi_end_io = raid5_end_read_request;
+
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rdev && test_bit(Faulty, &rdev->flags))
+			rdev = NULL;
+		if (rdev)
+			atomic_inc(&rdev->nr_pending);
+		rcu_read_unlock();
+
+		if (rdev) {
+			if (test_bit(STRIPE_SYNCING, &sh->state) ||
+				test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
+				test_bit(STRIPE_EXPAND_READY, &sh->state))
+				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+			bi->bi_bdev = rdev->bdev;
+			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
+				__FUNCTION__, (unsigned long long)sh->sector,
+				bi->bi_rw, i);
+			atomic_inc(&sh->count);
+			bi->bi_sector = sh->sector + rdev->data_offset;
+			bi->bi_flags = 1 << BIO_UPTODATE;
+			bi->bi_vcnt = 1;
+			bi->bi_max_vecs = 1;
+			bi->bi_idx = 0;
+			bi->bi_io_vec = &sh->dev[i].vec;
+			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+			bi->bi_io_vec[0].bv_offset = 0;
+			bi->bi_size = STRIPE_SIZE;
+			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS,
+					&rdev->corrected_errors);
+			generic_make_request(bi);
+		} else {
+			if (rw == WRITE)
+				set_bit(STRIPE_DEGRADED, &sh->state);
+			pr_debug("skip op %ld on disc %d for sector %llu\n",
+				bi->bi_rw, i, (unsigned long long)sh->sector);
+			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			set_bit(STRIPE_HANDLE, &sh->state);
+		}
+	}
+}
+
+static struct dma_async_tx_descriptor *
+async_copy_data(int frombio, struct bio *bio, struct page *page,
+	sector_t sector, struct dma_async_tx_descriptor *tx)
+{
+	struct bio_vec *bvl;
+	struct page *bio_page;
+	int i;
+	int page_offset;
+
+	if (bio->bi_sector >= sector)
+		page_offset = (signed)(bio->bi_sector - sector) * 512;
+	else
+		page_offset = (signed)(sector - bio->bi_sector) * -512;
+	bio_for_each_segment(bvl, bio, i) {
+		int len = bio_iovec_idx(bio, i)->bv_len;
+		int clen;
+		int b_offset = 0;
+
+		if (page_offset < 0) {
+			b_offset = -page_offset;
+			page_offset += b_offset;
+			len -= b_offset;
+		}
+
+		if (len > 0 && page_offset + len > STRIPE_SIZE)
+			clen = STRIPE_SIZE - page_offset;
+		else
+			clen = len;
+
+		if (clen > 0) {
+			b_offset += bio_iovec_idx(bio, i)->bv_offset;
+			bio_page = bio_iovec_idx(bio, i)->bv_page;
+			if (frombio)
+				tx = async_memcpy(page, bio_page, page_offset,
+					b_offset, clen,
+					ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC,
+					tx, NULL, NULL);
+			else
+				tx = async_memcpy(bio_page, page, b_offset,
+					page_offset, clen,
+					ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST,
+					tx, NULL, NULL);
+		}
+		if (clen < len) /* hit end of page */
+			break;
+		page_offset +=  len;
+	}
+
+	return tx;
+}
+
+static void ops_complete_biofill(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	struct bio *return_bi = NULL;
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, more_to_read = 0;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	/* clear completed biofills */
+	for (i = sh->disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		/* check if this stripe has new incoming reads */
+		if (dev->toread)
+			more_to_read++;
+
+		/* acknowledge completion of a biofill operation */
+		/* and check if we need to reply to a read request
+		*/
+		if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) {
+			struct bio *rbi, *rbi2;
+			clear_bit(R5_Wantfill, &dev->flags);
+
+			/* The access to dev->read is outside of the
+			 * spin_lock_irq(&conf->device_lock), but is protected
+			 * by the STRIPE_OP_BIOFILL pending bit
+			 */
+			BUG_ON(!dev->read);
+			rbi = dev->read;
+			dev->read = NULL;
+			while (rbi && rbi->bi_sector <
+				dev->sector + STRIPE_SECTORS) {
+				rbi2 = r5_next_bio(rbi, dev->sector);
+				spin_lock_irq(&conf->device_lock);
+				if (--rbi->bi_phys_segments == 0) {
+					rbi->bi_next = return_bi;
+					return_bi = rbi;
+				}
+				spin_unlock_irq(&conf->device_lock);
+				rbi = rbi2;
+			}
+		}
+	}
+	clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
+	clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+
+	return_io(return_bi);
+
+	if (more_to_read)
+		set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void ops_run_biofill(struct stripe_head *sh)
+{
+	struct dma_async_tx_descriptor *tx = NULL;
+	raid5_conf_t *conf = sh->raid_conf;
+	int i;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = sh->disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (test_bit(R5_Wantfill, &dev->flags)) {
+			struct bio *rbi;
+			spin_lock_irq(&conf->device_lock);
+			dev->read = rbi = dev->toread;
+			dev->toread = NULL;
+			spin_unlock_irq(&conf->device_lock);
+			while (rbi && rbi->bi_sector <
+				dev->sector + STRIPE_SECTORS) {
+				tx = async_copy_data(0, rbi, dev->page,
+					dev->sector, tx);
+				rbi = r5_next_bio(rbi, dev->sector);
+			}
+		}
+	}
+
+	atomic_inc(&sh->count);
+	async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
+		ops_complete_biofill, sh);
+}
+
+static void ops_complete_compute5(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int target = sh->ops.target;
+	struct r5dev *tgt = &sh->dev[target];
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	set_bit(R5_UPTODATE, &tgt->flags);
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+	clear_bit(R5_Wantcompute, &tgt->flags);
+	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_compute5(struct stripe_head *sh, unsigned long pending)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+	int target = sh->ops.target;
+	struct r5dev *tgt = &sh->dev[target];
+	struct page *xor_dest = tgt->page;
+	int count = 0;
+	struct dma_async_tx_descriptor *tx;
+	int i;
+
+	pr_debug("%s: stripe %llu block: %d\n",
+		__FUNCTION__, (unsigned long long)sh->sector, target);
+	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
+
+	for (i = disks; i--; )
+		if (i != target)
+			xor_srcs[count++] = sh->dev[i].page;
+
+	atomic_inc(&sh->count);
+
+	if (unlikely(count == 1))
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
+			0, NULL, ops_complete_compute5, sh);
+	else
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+			ASYNC_TX_XOR_ZERO_DST, NULL,
+			ops_complete_compute5, sh);
+
+	/* ack now if postxor is not set to be run */
+	if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
+		async_tx_ack(tx);
+
+	return tx;
+}
+
+static void ops_complete_prexor(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+	int count = 0, pd_idx = sh->pd_idx, i;
+
+	/* existing parity data subtracted */
+	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		/* Only process blocks that are known to be uptodate */
+		if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
+			xor_srcs[count++] = dev->page;
+	}
+
+	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+		ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
+		ops_complete_prexor, sh);
+
+	return tx;
+}
+
+static struct dma_async_tx_descriptor *
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+{
+	int disks = sh->disks;
+	int pd_idx = sh->pd_idx, i;
+
+	/* check if prexor is active which means only process blocks
+	 * that are part of a read-modify-write (Wantprexor)
+	 */
+	int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		struct bio *chosen;
+		int towrite;
+
+		towrite = 0;
+		if (prexor) { /* rmw */
+			if (dev->towrite &&
+			    test_bit(R5_Wantprexor, &dev->flags))
+				towrite = 1;
+		} else { /* rcw */
+			if (i != pd_idx && dev->towrite &&
+				test_bit(R5_LOCKED, &dev->flags))
+				towrite = 1;
+		}
+
+		if (towrite) {
+			struct bio *wbi;
+
+			spin_lock(&sh->lock);
+			chosen = dev->towrite;
+			dev->towrite = NULL;
+			BUG_ON(dev->written);
+			wbi = dev->written = chosen;
+			spin_unlock(&sh->lock);
+
+			while (wbi && wbi->bi_sector <
+				dev->sector + STRIPE_SECTORS) {
+				tx = async_copy_data(1, wbi, dev->page,
+					dev->sector, tx);
+				wbi = r5_next_bio(wbi, dev->sector);
+			}
+		}
+	}
+
+	return tx;
+}
+
+static void ops_complete_postxor(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void ops_complete_write(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int disks = sh->disks, i, pd_idx = sh->pd_idx;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (dev->written || i == pd_idx)
+			set_bit(R5_UPTODATE, &dev->flags);
+	}
+
+	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
+	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct page *xor_dest;
+	int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+	unsigned long flags;
+	dma_async_tx_callback callback;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	/* check if prexor is active which means only process blocks
+	 * that are part of a read-modify-write (written)
+	 */
+	if (prexor) {
+		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (dev->written)
+				xor_srcs[count++] = dev->page;
+		}
+	} else {
+		xor_dest = sh->dev[pd_idx].page;
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (i != pd_idx)
+				xor_srcs[count++] = dev->page;
+		}
+	}
+
+	/* check whether this postxor is part of a write */
+	callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ?
+		ops_complete_write : ops_complete_postxor;
+
+	/* 1/ if we prexor'd then the dest is reused as a source
+	 * 2/ if we did not prexor then we are redoing the parity
+	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
+	 * for the synchronous xor case
+	 */
+	flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
+		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+	atomic_inc(&sh->count);
+
+	if (unlikely(count == 1)) {
+		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
+		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
+			flags, tx, callback, sh);
+	} else
+		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+			flags, tx, callback, sh);
+}
+
+static void ops_complete_check(void *stripe_head_ref)
+{
+	struct stripe_head *sh = stripe_head_ref;
+	int pd_idx = sh->pd_idx;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
+		sh->ops.zero_sum_result == 0)
+		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+
+	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
+	set_bit(STRIPE_HANDLE, &sh->state);
+	release_stripe(sh);
+}
+
+static void ops_run_check(struct stripe_head *sh)
+{
+	/* kernel stack size limits the total number of disks */
+	int disks = sh->disks;
+	struct page *xor_srcs[disks];
+	struct dma_async_tx_descriptor *tx;
+
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
+
+	pr_debug("%s: stripe %llu\n", __FUNCTION__,
+		(unsigned long long)sh->sector);
+
+	for (i = disks; i--; ) {
+		struct r5dev *dev = &sh->dev[i];
+		if (i != pd_idx)
+			xor_srcs[count++] = dev->page;
+	}
+
+	tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
+		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
+
+	if (tx)
+		set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
+	else
+		clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
+
+	atomic_inc(&sh->count);
+	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
+		ops_complete_check, sh);
+}
+
+static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
+{
+	int overlap_clear = 0, i, disks = sh->disks;
+	struct dma_async_tx_descriptor *tx = NULL;
+
+	if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
+		ops_run_biofill(sh);
+		overlap_clear++;
+	}
+
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
+		tx = ops_run_compute5(sh, pending);
+
+	if (test_bit(STRIPE_OP_PREXOR, &pending))
+		tx = ops_run_prexor(sh, tx);
+
+	if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
+		tx = ops_run_biodrain(sh, tx);
+		overlap_clear++;
+	}
+
+	if (test_bit(STRIPE_OP_POSTXOR, &pending))
+		ops_run_postxor(sh, tx);
+
+	if (test_bit(STRIPE_OP_CHECK, &pending))
+		ops_run_check(sh);
+
+	if (test_bit(STRIPE_OP_IO, &pending))
+		ops_run_io(sh);
+
+	if (overlap_clear)
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&sh->raid_conf->wait_for_overlap);
+		}
+}
+
 static int grow_one_stripe(raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index b99d354f6128..6fb9d94e6f2e 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -116,13 +116,46 @@
  *  attach a request to an active stripe (add_stripe_bh())
  *     lockdev attach-buffer unlockdev
  *  handle a stripe (handle_stripe())
- *     lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
+ *     lockstripe clrSTRIPE_HANDLE ...
+ *		(lockdev check-buffers unlockdev) ..
+ *		change-state ..
+ *		record io/ops needed unlockstripe schedule io/ops
  *  release an active stripe (release_stripe())
  *     lockdev if (!--cnt) { if  STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
  *
  * The refcount counts each thread that have activated the stripe,
  * plus raid5d if it is handling it, plus one for each active request
- * on a cached buffer.
+ * on a cached buffer, and plus one if the stripe is undergoing stripe
+ * operations.
+ *
+ * Stripe operations are performed outside the stripe lock,
+ * the stripe operations are:
+ * -copying data between the stripe cache and user application buffers
+ * -computing blocks to save a disk access, or to recover a missing block
+ * -updating the parity on a write operation (reconstruct write and
+ *  read-modify-write)
+ * -checking parity correctness
+ * -running i/o to disk
+ * These operations are carried out by raid5_run_ops which uses the async_tx
+ * api to (optionally) offload operations to dedicated hardware engines.
+ * When requesting an operation handle_stripe sets the pending bit for the
+ * operation and increments the count.  raid5_run_ops is then run whenever
+ * the count is non-zero.
+ * There are some critical dependencies between the operations that prevent some
+ * from being requested while another is in flight.
+ * 1/ Parity check operations destroy the in cache version of the parity block,
+ *    so we prevent parity dependent operations like writes and compute_blocks
+ *    from starting while a check is in progress.  Some dma engines can perform
+ *    the check without damaging the parity block, in these cases the parity
+ *    block is re-marked up to date (assuming the check was successful) and is
+ *    not re-read from disk.
+ * 2/ When a write operation is requested we immediately lock the affected
+ *    blocks, and mark them as not up to date.  This causes new read requests
+ *    to be held off, as well as parity checks and compute block operations.
+ * 3/ Once a compute block operation has been requested handle_stripe treats
+ *    that block as if it is up to date.  raid5_run_ops guaruntees that any
+ *    operation that is dependent on the compute block result is initiated after
+ *    the compute block completes.
  */
 
 struct stripe_head {
@@ -136,11 +169,26 @@ struct stripe_head {
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;			/* disks in stripe */
+	/* stripe_operations
+	 * @pending - pending ops flags (set for request->issue->complete)
+	 * @ack - submitted ops flags (set for issue->complete)
+	 * @complete - completed ops flags (set for complete)
+	 * @target - STRIPE_OP_COMPUTE_BLK target
+	 * @count - raid5_runs_ops is set to run when this is non-zero
+	 */
+	struct stripe_operations {
+		unsigned long	   pending;
+		unsigned long	   ack;
+		unsigned long	   complete;
+		int		   target;
+		int		   count;
+		u32		   zero_sum_result;
+	} ops;
 	struct r5dev {
 		struct bio	req;
 		struct bio_vec	vec;
 		struct page	*page;
-		struct bio	*toread, *towrite, *written;
+		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;			/* sector of this page */
 		unsigned long	flags;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
@@ -174,6 +222,15 @@ struct r6_state {
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
 
 #define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Wantcompute	11 /* compute_block in progress treat as
+				    * uptodate
+				    */
+#define	R5_Wantfill	12 /* dev->toread contains a bio that needs
+				    * filling
+				    */
+#define	R5_Wantprexor	13 /* distinguish blocks ready for rmw from
+				    * other "towrites"
+				    */
 /*
  * Write method
  */
@@ -195,6 +252,24 @@ struct r6_state {
 #define	STRIPE_EXPANDING	9
 #define	STRIPE_EXPAND_SOURCE	10
 #define	STRIPE_EXPAND_READY	11
+/*
+ * Operations flags (in issue order)
+ */
+#define STRIPE_OP_BIOFILL	0
+#define STRIPE_OP_COMPUTE_BLK	1
+#define STRIPE_OP_PREXOR	2
+#define STRIPE_OP_BIODRAIN	3
+#define STRIPE_OP_POSTXOR	4
+#define STRIPE_OP_CHECK	5
+#define STRIPE_OP_IO		6
+
+/* modifiers to the base operations
+ * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
+ * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
+ */
+#define STRIPE_OP_MOD_REPAIR_PD 7
+#define STRIPE_OP_MOD_DMA_CHECK 8
+
 /*
  * Plugging:
  *
-- 
cgit v1.3-8-gc7d7


From f38e12199a94ca458e4f03c5a2c984fb80adadc5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 13:52:30 -0700
Subject: md: handle_stripe5 - add request/completion logic for async compute
 ops

handle_stripe will compute a block when a backing disk has failed, or when
it determines it can save a disk read by computing the block from all the
other up-to-date blocks.

Previously a block would be computed under the lock and subsequent logic in
handle_stripe could use the newly up-to-date block.  With the raid5_run_ops
implementation the compute operation is carried out a later time outside
the lock.  To preserve the old functionality we take advantage of the
dependency chain feature of async_tx to flag the block as R5_Wantcompute
and then let other parts of handle_stripe operate on the block as if it
were up-to-date.  raid5_run_ops guarantees that the block will be ready
before it is used in another operation.

However, this only works in cases where the compute and the dependent
operation are scheduled at the same time.  If a previous call to
handle_stripe sets the R5_Wantcompute flag there is no facility to pass the
async_tx dependency chain across successive calls to raid5_run_ops.  The
req_compute variable protects against this case.

Changelog:
* remove the req_compute BUG_ON

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 149 ++++++++++++++++++++++++++++++++++-----------
 include/linux/raid/raid5.h |   2 +-
 2 files changed, 115 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d9521aa69461..42439a4c1c51 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2077,36 +2077,101 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
 
 }
 
+/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
+ * to process
+ */
+static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
+			struct stripe_head_state *s, int disk_idx, int disks)
+{
+	struct r5dev *dev = &sh->dev[disk_idx];
+	struct r5dev *failed_dev = &sh->dev[s->failed_num];
+
+	/* don't schedule compute operations or reads on the parity block while
+	 * a check is in flight
+	 */
+	if ((disk_idx == sh->pd_idx) &&
+	     test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+		return ~0;
+
+	/* is the data in this block needed, and can we get it? */
+	if (!test_bit(R5_LOCKED, &dev->flags) &&
+	    !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
+	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
+	     s->syncing || s->expanding || (s->failed &&
+	     (failed_dev->toread || (failed_dev->towrite &&
+	     !test_bit(R5_OVERWRITE, &failed_dev->flags)
+	     ))))) {
+		/* 1/ We would like to get this block, possibly by computing it,
+		 * but we might not be able to.
+		 *
+		 * 2/ Since parity check operations potentially make the parity
+		 * block !uptodate it will need to be refreshed before any
+		 * compute operations on data disks are scheduled.
+		 *
+		 * 3/ We hold off parity block re-reads until check operations
+		 * have quiesced.
+		 */
+		if ((s->uptodate == disks - 1) &&
+		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+			set_bit(R5_Wantcompute, &dev->flags);
+			sh->ops.target = disk_idx;
+			s->req_compute = 1;
+			sh->ops.count++;
+			/* Careful: from this point on 'uptodate' is in the eye
+			 * of raid5_run_ops which services 'compute' operations
+			 * before writes. R5_Wantcompute flags a block that will
+			 * be R5_UPTODATE by the time it is needed for a
+			 * subsequent operation.
+			 */
+			s->uptodate++;
+			return 0; /* uptodate + compute == disks */
+		} else if ((s->uptodate < disks - 1) &&
+			test_bit(R5_Insync, &dev->flags)) {
+			/* Note: we hold off compute operations while checks are
+			 * in flight, but we still prefer 'compute' over 'read'
+			 * hence we only read if (uptodate < * disks-1)
+			 */
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantread, &dev->flags);
+			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+				sh->ops.count++;
+			s->locked++;
+			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
+				s->syncing);
+		}
+	}
+
+	return ~0;
+}
+
 static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 			struct stripe_head_state *s, int disks)
 {
 	int i;
-	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
-		if (!test_bit(R5_LOCKED, &dev->flags) &&
-		    !test_bit(R5_UPTODATE, &dev->flags) &&
-		    (dev->toread ||
-		     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-		     s->syncing || s->expanding ||
-		     (s->failed && (sh->dev[s->failed_num].toread ||
-			(sh->dev[s->failed_num].towrite &&
-			!test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags))
-		      )))) {
-			/* we would like to get this block, possibly
-			 * by computing it, but we might not be able to
-			 */
-			if (s->uptodate == disks-1) {
-				pr_debug("Computing block %d\n", i);
-				compute_block(sh, i);
-				s->uptodate++;
-			} else if (test_bit(R5_Insync, &dev->flags)) {
-				set_bit(R5_LOCKED, &dev->flags);
-				set_bit(R5_Wantread, &dev->flags);
-				s->locked++;
-				pr_debug("Reading block %d (sync=%d)\n",
-					i, s->syncing);
-			}
-		}
+
+	/* Clear completed compute operations.  Parity recovery
+	 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
+	 * later on in this routine
+	 */
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
+		!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
+		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+	}
+
+	/* look for blocks to read/compute, skip this if a compute
+	 * is already in flight, or if the stripe contents are in the
+	 * midst of changing due to a write
+	 */
+	if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+		!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
+		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+		for (i = disks; i--; )
+			if (__handle_issuing_new_read_requests5(
+				sh, s, i, disks) == 0)
+				break;
 	}
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
@@ -2223,7 +2288,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 		struct r5dev *dev = &sh->dev[i];
 		if ((dev->towrite || i == sh->pd_idx) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !test_bit(R5_UPTODATE, &dev->flags)) {
+		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
 			else
@@ -2232,9 +2298,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 		/* Would I have to read this buffer for reconstruct_write */
 		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !test_bit(R5_UPTODATE, &dev->flags)) {
-			if (test_bit(R5_Insync, &dev->flags))
-				rcw++;
+		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    test_bit(R5_Wantcompute, &dev->flags))) {
+			if (test_bit(R5_Insync, &dev->flags)) rcw++;
 			else
 				rcw += 2*disks;
 		}
@@ -2248,7 +2314,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 			struct r5dev *dev = &sh->dev[i];
 			if ((dev->towrite || i == sh->pd_idx) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			    test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -2270,7 +2337,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
 			    i != sh->pd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !test_bit(R5_UPTODATE, &dev->flags) &&
+			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			    test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (
 				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -2288,8 +2356,17 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 	/* now if nothing is locked, and if we have enough data,
 	 * we can start a write request
 	 */
-	if (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state))
+	/* since handle_stripe can be called at any time we need to handle the
+	 * case where a compute block operation has been submitted and then a
+	 * subsequent call wants to start a write request.  raid5_run_ops only
+	 * handles the case where compute block and postxor are requested
+	 * simultaneously.  If this is not the case then new writes need to be
+	 * held off until the compute completes.
+	 */
+	if ((s->req_compute ||
+	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
+		(s->locked == 0 && (rcw == 0 || rmw == 0) &&
+		!test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		s->locked += handle_write_operations5(sh, rcw == 0, 0);
 }
 
@@ -2650,6 +2727,7 @@ static void handle_stripe5(struct stripe_head *sh)
 		/* now count some things */
 		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
 		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
+		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
 
 		if (dev->toread)
 			s.to_read++;
@@ -2706,7 +2784,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * or to load a block that is being partially written.
 	 */
 	if (s.to_read || s.non_overwrite ||
-		(s.syncing && (s.uptodate < disks)) || s.expanding)
+	    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
+	    test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
 		handle_issuing_new_read_requests5(sh, &s, disks);
 
 	/* Now we check to see if any write operations have recently
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 6fb9d94e6f2e..2293015de1d5 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -200,7 +200,7 @@ struct stripe_head {
 struct stripe_head_state {
 	int syncing, expanding, expanded;
 	int locked, uptodate, to_read, to_write, failed, written;
-	int non_overwrite;
+	int compute, req_compute, non_overwrite;
 	int failed_num;
 };
 
-- 
cgit v1.3-8-gc7d7


From b5e98d65d34a1c11a2135ea8a9b2619dbc7216c8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 2 Jan 2007 13:52:31 -0700
Subject: md: handle_stripe5 - add request/completion logic for async read ops

When a read bio is attached to the stripe and the corresponding block is
marked R5_UPTODATE, then a read (biofill) operation is scheduled to copy
the data from the stripe cache to the bio buffer.  handle_stripe flags the
blocks to be operated on with the R5_Wantfill flag.  If new read requests
arrive while raid5_run_ops is running they will not be handled until
handle_stripe is scheduled to run again.

Changelog:
* cleanup to_read and to_fill accounting
* do not fail reads that have reached the cache

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-By: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c         | 53 ++++++++++++++++++++++------------------------
 include/linux/raid/raid5.h |  2 +-
 2 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 810cf831edda..a33dac7c2e2f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2049,9 +2049,12 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
 			bi = bi2;
 		}
 
-		/* fail any reads if this device is non-operational */
-		if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
-		    test_bit(R5_ReadError, &sh->dev[i].flags)) {
+		/* fail any reads if this device is non-operational and
+		 * the data has not reached the cache yet.
+		 */
+		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
+		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
 			bi = sh->dev[i].toread;
 			sh->dev[i].toread = NULL;
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -2740,37 +2743,27 @@ static void handle_stripe5(struct stripe_head *sh)
 		struct r5dev *dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
 
-		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
-			i, dev->flags, dev->toread, dev->towrite, dev->written);
-		/* maybe we can reply to a read */
-		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-			struct bio *rbi, *rbi2;
-			pr_debug("Return read for disc %d\n", i);
-			spin_lock_irq(&conf->device_lock);
-			rbi = dev->toread;
-			dev->toread = NULL;
-			if (test_and_clear_bit(R5_Overlap, &dev->flags))
-				wake_up(&conf->wait_for_overlap);
-			spin_unlock_irq(&conf->device_lock);
-			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-				copy_data(0, rbi, dev->page, dev->sector);
-				rbi2 = r5_next_bio(rbi, dev->sector);
-				spin_lock_irq(&conf->device_lock);
-				if (--rbi->bi_phys_segments == 0) {
-					rbi->bi_next = return_bi;
-					return_bi = rbi;
-				}
-				spin_unlock_irq(&conf->device_lock);
-				rbi = rbi2;
-			}
-		}
+		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
+			"written %p\n",	i, dev->flags, dev->toread, dev->read,
+			dev->towrite, dev->written);
+
+		/* maybe we can request a biofill operation
+		 *
+		 * new wantfill requests are only permitted while
+		 * STRIPE_OP_BIOFILL is clear
+		 */
+		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
+			!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
 		if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
 		if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
 		if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
 
-		if (dev->toread)
+		if (test_bit(R5_Wantfill, &dev->flags))
+			s.to_fill++;
+		else if (dev->toread)
 			s.to_read++;
 		if (dev->towrite) {
 			s.to_write++;
@@ -2793,6 +2786,10 @@ static void handle_stripe5(struct stripe_head *sh)
 			set_bit(R5_Insync, &dev->flags);
 	}
 	rcu_read_unlock();
+
+	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+		sh->ops.count++;
+
 	pr_debug("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
 		s.locked, s.uptodate, s.to_read, s.to_write,
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 2293015de1d5..93678f57ccbe 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -200,7 +200,7 @@ struct stripe_head {
 struct stripe_head_state {
 	int syncing, expanding, expanded;
 	int locked, uptodate, to_read, to_write, failed, written;
-	int compute, req_compute, non_overwrite;
+	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num;
 };
 
-- 
cgit v1.3-8-gc7d7


From 3039f0735a280b54c7364fbfe6a9287f7f0b510a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 13 Jul 2007 08:06:19 -0700
Subject: ioatdma: add the unisys "i/oat" pci vendor/device id

Cc: John Magolan <john.magolan@unisys.com>
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/ioatdma.c   | 2 ++
 include/linux/pci_ids.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 81810b3042f1..5fbe56b5cea0 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -517,6 +517,8 @@ static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
 
 static struct pci_device_id ioat_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_UNISYS,
+		     PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) },
 	{ 0, }
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 5b1c9994f89a..0275f6917c8e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -475,6 +475,9 @@
 #define PCI_DEVICE_ID_IBM_ICOM_V2_ONE_PORT_RVX_ONE_PORT_MDM_PCIE 0x0361
 #define PCI_DEVICE_ID_IBM_ICOM_FOUR_PORT_MODEL	0x252
 
+#define PCI_VENDOR_ID_UNISYS		0x1018
+#define PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR 0x001C
+
 #define PCI_VENDOR_ID_COMPEX2		0x101a /* pci.ids says "AT&T GIS (NCR)" */
 #define PCI_DEVICE_ID_COMPEX2_100VG	0x0005
 
-- 
cgit v1.3-8-gc7d7


From f787a50306680c187cf2896a8017937c1bf6dc7e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 11 Jul 2007 21:21:47 +0200
Subject: [PATCH] sched: small topology.h cleanup

trivial cleanup: LOCAL_DISTANCE and REMOTE_DISTANCE are only used in
topology.h and inside an #ifndef section - limit their existence to
that #ifndef.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index da6c39b2d051..d0890a7e5bab 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -50,10 +50,10 @@
 	for_each_online_node(node)						\
 		if (nr_cpus_node(node))
 
-#ifndef node_distance
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
 #define REMOTE_DISTANCE		20
+#ifndef node_distance
 #define node_distance(from,to)	((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
 #endif
 #ifndef RECLAIM_DISTANCE
-- 
cgit v1.3-8-gc7d7


From 24023451c8df726692e2f52288a20870d13b501f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 14 Jul 2007 18:51:31 -0700
Subject: [NET]: Add net_device change_rx_mode callback

Currently the set_multicast_list (and set_rx_mode) callbacks are
responsible for configuring the device according to the IFF_PROMISC,
IFF_MULTICAST and IFF_ALLMULTI flags and the mc_list (and uc_list in
case of set_rx_mode).

These callbacks can be invoked from BH context without the rtnl_mutex
by dev_mc_add/dev_mc_delete, which makes reading the device flags and
promiscous/allmulti count racy. For real hardware drivers that just
commit all changes to the hardware this is not a real problem since
the stack guarantees to call them for every change, so at least the
final call will not race and commit the correct configuration to the
hardware.

For software devices that want to synchronize promiscous and multicast
state to an underlying device however this can cause corruption of the
underlying device's flags or promisc/allmulti counts.

When the software device is concurrently put in promiscous or allmulti
mode while set_multicast_list is invoked from bottem half context, the
device might synchronize the change to the underlying device without
holding the rtnl_mutex, which races with concurrent changes to the
underlying device.

Add a dev->change_rx_flags hook that is invoked when any of the flags
that affect rx filtering change (under the rtnl_mutex), which allows
drivers to perform synchronization immediately and only synchronize
the address lists in set_multicast_list/set_rx_mode.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 net/core/dev.c            | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 79cc3dab4be7..f193aba30384 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -516,6 +516,9 @@ struct net_device
 						void *saddr,
 						unsigned len);
 	int			(*rebuild_header)(struct sk_buff *skb);
+#define HAVE_CHANGE_RX_FLAGS
+	void			(*change_rx_flags)(struct net_device *dev,
+						   int flags);
 #define HAVE_SET_RX_MODE
 	void			(*set_rx_mode)(struct net_device *dev);
 #define HAVE_MULTICAST			 
diff --git a/net/core/dev.c b/net/core/dev.c
index 96443055324e..59ec811d2b54 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2521,6 +2521,8 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
 
+	ASSERT_RTNL();
+
 	if ((dev->promiscuity += inc) == 0)
 		dev->flags &= ~IFF_PROMISC;
 	else
@@ -2535,6 +2537,9 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc)
 			dev->name, (dev->flags & IFF_PROMISC),
 			(old_flags & IFF_PROMISC),
 			audit_get_loginuid(current->audit_context));
+
+		if (dev->change_rx_flags)
+			dev->change_rx_flags(dev, IFF_PROMISC);
 	}
 }
 
@@ -2573,11 +2578,16 @@ void dev_set_allmulti(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
 
+	ASSERT_RTNL();
+
 	dev->flags |= IFF_ALLMULTI;
 	if ((dev->allmulti += inc) == 0)
 		dev->flags &= ~IFF_ALLMULTI;
-	if (dev->flags ^ old_flags)
+	if (dev->flags ^ old_flags) {
+		if (dev->change_rx_flags)
+			dev->change_rx_flags(dev, IFF_ALLMULTI);
 		dev_set_rx_mode(dev);
+	}
 }
 
 /*
@@ -2778,6 +2788,8 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 	int ret, changes;
 	int old_flags = dev->flags;
 
+	ASSERT_RTNL();
+
 	/*
 	 *	Set the flags on our device.
 	 */
@@ -2792,6 +2804,9 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 	 *	Load in the correct multicast list now the flags have changed.
 	 */
 
+	if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
+		dev->change_rx_flags(dev, IFF_MULTICAST);
+
 	dev_set_rx_mode(dev);
 
 	/*
-- 
cgit v1.3-8-gc7d7


From a0a400d79e3dd7843e7e81baa3ef2957bdc292d0 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 14 Jul 2007 18:52:02 -0700
Subject: [NET]: dev_mcast: add multicast list synchronization helpers

The method drivers currently use to synchronize multicast lists is not
very pretty:

- walk the multicast list
- search each entry on a copy of the previous list
- if new add to lower device
- walk the copy of the previous list
- search each entry on the current list
- if removed delete from lower device
- copy entire list

This patch adds a new field to struct dev_addr_list to store the
synchronization state and adds two helper functions for synchronization
and cleanup.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 ++
 net/core/dev_mcast.c      | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f193aba30384..e5af458ab04b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -190,6 +190,7 @@ struct dev_addr_list
 	struct dev_addr_list	*next;
 	u8			da_addr[MAX_ADDR_LEN];
 	u8			da_addrlen;
+	u8			da_synced;
 	int			da_users;
 	int			da_gusers;
 };
@@ -1103,6 +1104,8 @@ extern int		dev_unicast_delete(struct net_device *dev, void *addr, int alen);
 extern int		dev_unicast_add(struct net_device *dev, void *addr, int alen);
 extern int 		dev_mc_delete(struct net_device *dev, void *addr, int alen, int all);
 extern int		dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly);
+extern int		dev_mc_sync(struct net_device *to, struct net_device *from);
+extern void		dev_mc_unsync(struct net_device *to, struct net_device *from);
 extern void		dev_mc_discard(struct net_device *dev);
 extern int 		__dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int all);
 extern int		__dev_addr_add(struct dev_addr_list **list, int *count, void *addr, int alen, int newonly);
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index aa38100601fb..235a2a8a0d05 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -102,6 +102,81 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	return err;
 }
 
+/**
+ *	dev_mc_sync	- Synchronize device's multicast list to another device
+ *	@to: destination device
+ *	@from: source device
+ *
+ * 	Add newly added addresses to the destination device and release
+ * 	addresses that have no users left. The source device must be
+ * 	locked by netif_tx_lock_bh.
+ *
+ *	This function is intended to be called from the dev->set_multicast_list
+ *	function of layered software devices.
+ */
+int dev_mc_sync(struct net_device *to, struct net_device *from)
+{
+	struct dev_addr_list *da;
+	int err = 0;
+
+	netif_tx_lock_bh(to);
+	for (da = from->mc_list; da != NULL; da = da->next) {
+		if (!da->da_synced) {
+			err = __dev_addr_add(&to->mc_list, &to->mc_count,
+					     da->da_addr, da->da_addrlen, 0);
+			if (err < 0)
+				break;
+			da->da_synced = 1;
+			da->da_users++;
+		} else if (da->da_users == 1) {
+			__dev_addr_delete(&to->mc_list, &to->mc_count,
+					  da->da_addr, da->da_addrlen, 0);
+			__dev_addr_delete(&from->mc_list, &from->mc_count,
+					  da->da_addr, da->da_addrlen, 0);
+		}
+	}
+	if (!err)
+		__dev_set_rx_mode(to);
+	netif_tx_unlock_bh(to);
+
+	return err;
+}
+EXPORT_SYMBOL(dev_mc_sync);
+
+
+/**
+ * 	dev_mc_unsync	- Remove synchronized addresses from the destination
+ * 			  device
+ *	@to: destination device
+ *	@from: source device
+ *
+ * 	Remove all addresses that were added to the destination device by
+ * 	dev_mc_sync(). This function is intended to be called from the
+ * 	dev->stop function of layered software devices.
+ */
+void dev_mc_unsync(struct net_device *to, struct net_device *from)
+{
+	struct dev_addr_list *da;
+
+	netif_tx_lock_bh(from);
+	netif_tx_lock_bh(to);
+
+	for (da = from->mc_list; da != NULL; da = da->next) {
+		if (!da->da_synced)
+			continue;
+		__dev_addr_delete(&to->mc_list, &to->mc_count,
+				  da->da_addr, da->da_addrlen, 0);
+		da->da_synced = 0;
+		__dev_addr_delete(&from->mc_list, &from->mc_count,
+				  da->da_addr, da->da_addrlen, 0);
+	}
+	__dev_set_rx_mode(to);
+
+	netif_tx_unlock_bh(to);
+	netif_tx_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_mc_unsync);
+
 /*
  *	Discard multicast list when a device is downed
  */
-- 
cgit v1.3-8-gc7d7


From 6c78dcbd47a68a7d25d2bee7a6c74b9136cb5fde Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 14 Jul 2007 18:52:56 -0700
Subject: [VLAN]: Fix promiscous/allmulti synchronization races

The set_multicast_list function may be called without holding the rtnl
mutex, resulting in races when changing the underlying device's promiscous
and allmulti state. Use the change_rx_mode hook, which is always invoked
under the rtnl.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  2 --
 net/8021q/vlan.c        |  1 +
 net/8021q/vlan.h        |  1 +
 net/8021q/vlan_dev.c    | 38 ++++++++++++++++++++------------------
 4 files changed, 22 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 61a57dc2ac99..7f71df4c952f 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -132,8 +132,6 @@ struct vlan_dev_info {
                                            * made, in order to feed the right changes down
                                            * to the real hardware...
                                            */
-	int old_allmulti;               /* similar to above. */
-	int old_promiscuity;            /* similar to above. */
 	struct net_device *real_dev;    /* the underlying device/interface */
 	unsigned char real_dev_addr[ETH_ALEN];
 	struct proc_dir_entry *dent;    /* Holds the proc data */
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index abb9900edb3f..39bdcc25c150 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -373,6 +373,7 @@ void vlan_setup(struct net_device *new_dev)
 	new_dev->open = vlan_dev_open;
 	new_dev->stop = vlan_dev_stop;
 	new_dev->set_multicast_list = vlan_dev_set_multicast_list;
+	new_dev->change_rx_flags = vlan_change_rx_flags;
 	new_dev->destructor = free_netdev;
 	new_dev->do_ioctl = vlan_dev_ioctl;
 
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 62ce1c519aab..7df5b2935579 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -69,6 +69,7 @@ int vlan_dev_set_vlan_flag(const struct net_device *dev,
 			   u32 flag, short flag_val);
 void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
 void vlan_dev_get_vid(const struct net_device *dev, unsigned short *result);
+void vlan_change_rx_flags(struct net_device *dev, int change);
 void vlan_dev_set_multicast_list(struct net_device *vlan_dev);
 
 int vlan_check_real_dev(struct net_device *real_dev, unsigned short vlan_id);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index d4a62d1b52b4..dec7e62b2e10 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -712,6 +712,11 @@ int vlan_dev_open(struct net_device *dev)
 	}
 	memcpy(vlan->real_dev_addr, real_dev->dev_addr, ETH_ALEN);
 
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(real_dev, 1);
+	if (dev->flags & IFF_PROMISC)
+		dev_set_promiscuity(real_dev, 1);
+
 	return 0;
 }
 
@@ -721,6 +726,11 @@ int vlan_dev_stop(struct net_device *dev)
 
 	vlan_flush_mc_list(dev);
 
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(real_dev, -1);
+	if (dev->flags & IFF_PROMISC)
+		dev_set_promiscuity(real_dev, -1);
+
 	if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr))
 		dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len);
 
@@ -754,34 +764,26 @@ int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	return err;
 }
 
+void vlan_change_rx_flags(struct net_device *dev, int change)
+{
+	struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+	if (change & IFF_PROMISC)
+		dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
+}
+
 /** Taken from Gleb + Lennert's VLAN code, and modified... */
 void vlan_dev_set_multicast_list(struct net_device *vlan_dev)
 {
 	struct dev_mc_list *dmi;
 	struct net_device *real_dev;
-	int inc;
 
 	if (vlan_dev && (vlan_dev->priv_flags & IFF_802_1Q_VLAN)) {
 		/* Then it's a real vlan device, as far as we can tell.. */
 		real_dev = VLAN_DEV_INFO(vlan_dev)->real_dev;
 
-		/* compare the current promiscuity to the last promisc we had.. */
-		inc = vlan_dev->promiscuity - VLAN_DEV_INFO(vlan_dev)->old_promiscuity;
-		if (inc) {
-			printk(KERN_INFO "%s: dev_set_promiscuity(master, %d)\n",
-			       vlan_dev->name, inc);
-			dev_set_promiscuity(real_dev, inc); /* found in dev.c */
-			VLAN_DEV_INFO(vlan_dev)->old_promiscuity = vlan_dev->promiscuity;
-		}
-
-		inc = vlan_dev->allmulti - VLAN_DEV_INFO(vlan_dev)->old_allmulti;
-		if (inc) {
-			printk(KERN_INFO "%s: dev_set_allmulti(master, %d)\n",
-			       vlan_dev->name, inc);
-			dev_set_allmulti(real_dev, inc); /* dev.c */
-			VLAN_DEV_INFO(vlan_dev)->old_allmulti = vlan_dev->allmulti;
-		}
-
 		/* looking for addresses to add to master's list */
 		for (dmi = vlan_dev->mc_list; dmi != NULL; dmi = dmi->next) {
 			if (vlan_should_add_mc(dmi, VLAN_DEV_INFO(vlan_dev)->old_mc_list)) {
-- 
cgit v1.3-8-gc7d7


From 56addd6eeeb4e11f5a0af7093ca078e0f29140e0 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 14 Jul 2007 18:53:28 -0700
Subject: [VLAN]: Use multicast list synchronization helpers

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |   5 --
 net/8021q/vlan_dev.c    | 131 +-----------------------------------------------
 2 files changed, 2 insertions(+), 134 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 7f71df4c952f..f8443fdb124a 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -127,11 +127,6 @@ struct vlan_dev_info {
                                         *   like DHCP that use packet-filtering and don't understand
                                         *   802.1Q
                                         */
-	struct dev_mc_list *old_mc_list;  /* old multi-cast list for the VLAN interface..
-                                           * we save this so we can tell what changes were
-                                           * made, in order to feed the right changes down
-                                           * to the real hardware...
-                                           */
 	struct net_device *real_dev;    /* the underlying device/interface */
 	unsigned char real_dev_addr[ETH_ALEN];
 	struct proc_dir_entry *dent;    /* Holds the proc data */
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index dec7e62b2e10..4d2aa4dd42ac 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -612,90 +612,6 @@ void vlan_dev_get_vid(const struct net_device *dev, unsigned short *result)
 	*result = VLAN_DEV_INFO(dev)->vlan_id;
 }
 
-static inline int vlan_dmi_equals(struct dev_mc_list *dmi1,
-				  struct dev_mc_list *dmi2)
-{
-	return ((dmi1->dmi_addrlen == dmi2->dmi_addrlen) &&
-		(memcmp(dmi1->dmi_addr, dmi2->dmi_addr, dmi1->dmi_addrlen) == 0));
-}
-
-/** dmi is a single entry into a dev_mc_list, a single node.  mc_list is
- *  an entire list, and we'll iterate through it.
- */
-static int vlan_should_add_mc(struct dev_mc_list *dmi, struct dev_mc_list *mc_list)
-{
-	struct dev_mc_list *idmi;
-
-	for (idmi = mc_list; idmi != NULL; ) {
-		if (vlan_dmi_equals(dmi, idmi)) {
-			if (dmi->dmi_users > idmi->dmi_users)
-				return 1;
-			else
-				return 0;
-		} else {
-			idmi = idmi->next;
-		}
-	}
-
-	return 1;
-}
-
-static inline void vlan_destroy_mc_list(struct dev_mc_list *mc_list)
-{
-	struct dev_mc_list *dmi = mc_list;
-	struct dev_mc_list *next;
-
-	while(dmi) {
-		next = dmi->next;
-		kfree(dmi);
-		dmi = next;
-	}
-}
-
-static void vlan_copy_mc_list(struct dev_mc_list *mc_list, struct vlan_dev_info *vlan_info)
-{
-	struct dev_mc_list *dmi, *new_dmi;
-
-	vlan_destroy_mc_list(vlan_info->old_mc_list);
-	vlan_info->old_mc_list = NULL;
-
-	for (dmi = mc_list; dmi != NULL; dmi = dmi->next) {
-		new_dmi = kmalloc(sizeof(*new_dmi), GFP_ATOMIC);
-		if (new_dmi == NULL) {
-			printk(KERN_ERR "vlan: cannot allocate memory. "
-			       "Multicast may not work properly from now.\n");
-			return;
-		}
-
-		/* Copy whole structure, then make new 'next' pointer */
-		*new_dmi = *dmi;
-		new_dmi->next = vlan_info->old_mc_list;
-		vlan_info->old_mc_list = new_dmi;
-	}
-}
-
-static void vlan_flush_mc_list(struct net_device *dev)
-{
-	struct dev_mc_list *dmi = dev->mc_list;
-
-	while (dmi) {
-		printk(KERN_DEBUG "%s: del %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address from vlan interface\n",
-		       dev->name,
-		       dmi->dmi_addr[0],
-		       dmi->dmi_addr[1],
-		       dmi->dmi_addr[2],
-		       dmi->dmi_addr[3],
-		       dmi->dmi_addr[4],
-		       dmi->dmi_addr[5]);
-		dev_mc_delete(dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
-		dmi = dev->mc_list;
-	}
-
-	/* dev->mc_list is NULL by the time we get here. */
-	vlan_destroy_mc_list(VLAN_DEV_INFO(dev)->old_mc_list);
-	VLAN_DEV_INFO(dev)->old_mc_list = NULL;
-}
-
 int vlan_dev_open(struct net_device *dev)
 {
 	struct vlan_dev_info *vlan = VLAN_DEV_INFO(dev);
@@ -724,8 +640,7 @@ int vlan_dev_stop(struct net_device *dev)
 {
 	struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev;
 
-	vlan_flush_mc_list(dev);
-
+	dev_mc_unsync(real_dev, dev);
 	if (dev->flags & IFF_ALLMULTI)
 		dev_set_allmulti(real_dev, -1);
 	if (dev->flags & IFF_PROMISC)
@@ -777,47 +692,5 @@ void vlan_change_rx_flags(struct net_device *dev, int change)
 /** Taken from Gleb + Lennert's VLAN code, and modified... */
 void vlan_dev_set_multicast_list(struct net_device *vlan_dev)
 {
-	struct dev_mc_list *dmi;
-	struct net_device *real_dev;
-
-	if (vlan_dev && (vlan_dev->priv_flags & IFF_802_1Q_VLAN)) {
-		/* Then it's a real vlan device, as far as we can tell.. */
-		real_dev = VLAN_DEV_INFO(vlan_dev)->real_dev;
-
-		/* looking for addresses to add to master's list */
-		for (dmi = vlan_dev->mc_list; dmi != NULL; dmi = dmi->next) {
-			if (vlan_should_add_mc(dmi, VLAN_DEV_INFO(vlan_dev)->old_mc_list)) {
-				dev_mc_add(real_dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
-				printk(KERN_DEBUG "%s: add %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address to master interface\n",
-				       vlan_dev->name,
-				       dmi->dmi_addr[0],
-				       dmi->dmi_addr[1],
-				       dmi->dmi_addr[2],
-				       dmi->dmi_addr[3],
-				       dmi->dmi_addr[4],
-				       dmi->dmi_addr[5]);
-			}
-		}
-
-		/* looking for addresses to delete from master's list */
-		for (dmi = VLAN_DEV_INFO(vlan_dev)->old_mc_list; dmi != NULL; dmi = dmi->next) {
-			if (vlan_should_add_mc(dmi, vlan_dev->mc_list)) {
-				/* if we think we should add it to the new list, then we should really
-				 * delete it from the real list on the underlying device.
-				 */
-				dev_mc_delete(real_dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
-				printk(KERN_DEBUG "%s: del %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address from master interface\n",
-				       vlan_dev->name,
-				       dmi->dmi_addr[0],
-				       dmi->dmi_addr[1],
-				       dmi->dmi_addr[2],
-				       dmi->dmi_addr[3],
-				       dmi->dmi_addr[4],
-				       dmi->dmi_addr[5]);
-			}
-		}
-
-		/* save multicast list */
-		vlan_copy_mc_list(vlan_dev->mc_list, VLAN_DEV_INFO(vlan_dev));
-	}
+	dev_mc_sync(VLAN_DEV_INFO(vlan_dev)->real_dev, vlan_dev);
 }
-- 
cgit v1.3-8-gc7d7


From b863ceb7ddcea8c55fcf1d7b2ac591d50aa7ed53 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 14 Jul 2007 18:55:06 -0700
Subject: [NET]: Add macvlan driver

Add macvlan driver, which allows to create virtual ethernet devices
based on MAC address.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                |   6 +
 drivers/net/Kconfig        |  10 +
 drivers/net/Makefile       |   1 +
 drivers/net/macvlan.c      | 496 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/if_macvlan.h |   9 +
 include/linux/netdevice.h  |   2 +
 net/core/dev.c             |  26 +++
 7 files changed, 550 insertions(+)
 create mode 100644 drivers/net/macvlan.c
 create mode 100644 include/linux/if_macvlan.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 845fbf4478b3..360eb581953b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2330,6 +2330,12 @@ W:	http://linuxwireless.org/
 T:	git kernel.org:/pub/scm/linux/kernel/git/jbenc/mac80211.git
 S:	Maintained
 
+MACVLAN DRIVER
+P:	Patrick McHardy
+M:	kaber@trash.net
+L:	netdev@vger.kernel.org
+S:	Maintained
+
 MARVELL YUKON / SYSKONNECT DRIVER
 P:	Mirko Lindner
 M: 	mlindner@syskonnect.de
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index ba314adf68b8..d17d64eb7065 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -82,6 +82,16 @@ config BONDING
 	  To compile this driver as a module, choose M here: the module
 	  will be called bonding.
 
+config MACVLAN
+	tristate "MAC-VLAN support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  This allows one to create virtual interfaces that map packets to
+	  or from specific MAC addresses to a particular interface.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called macvlan.
+
 config EQUALIZER
 	tristate "EQL (serial line load balancing) support"
 	---help---
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index a2241e6e1457..c26b8674213c 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -128,6 +128,7 @@ obj-$(CONFIG_SLHC) += slhc.o
 
 obj-$(CONFIG_DUMMY) += dummy.o
 obj-$(CONFIG_IFB) += ifb.o
+obj-$(CONFIG_MACVLAN) += macvlan.o
 obj-$(CONFIG_DE600) += de600.o
 obj-$(CONFIG_DE620) += de620.o
 obj-$(CONFIG_LANCE) += lance.o
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
new file mode 100644
index 000000000000..dc74d006e01f
--- /dev/null
+++ b/drivers/net/macvlan.c
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * The code this is based on carried the following copyright notice:
+ * ---
+ * (C) Copyright 2001-2006
+ * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com
+ * Re-worked by Ben Greear <greearb@candelatech.com>
+ * ---
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/if_arp.h>
+#include <linux/if_link.h>
+#include <linux/if_macvlan.h>
+#include <net/rtnetlink.h>
+
+#define MACVLAN_HASH_SIZE	(1 << BITS_PER_BYTE)
+
+struct macvlan_port {
+	struct net_device	*dev;
+	struct hlist_head	vlan_hash[MACVLAN_HASH_SIZE];
+	struct list_head	vlans;
+};
+
+struct macvlan_dev {
+	struct net_device	*dev;
+	struct list_head	list;
+	struct hlist_node	hlist;
+	struct macvlan_port	*port;
+	struct net_device	*lowerdev;
+};
+
+
+static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
+					       const unsigned char *addr)
+{
+	struct macvlan_dev *vlan;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(vlan, n, &port->vlan_hash[addr[5]], hlist) {
+		if (!compare_ether_addr(vlan->dev->dev_addr, addr))
+			return vlan;
+	}
+	return NULL;
+}
+
+static void macvlan_broadcast(struct sk_buff *skb,
+			      const struct macvlan_port *port)
+{
+	const struct ethhdr *eth = eth_hdr(skb);
+	const struct macvlan_dev *vlan;
+	struct hlist_node *n;
+	struct net_device *dev;
+	struct sk_buff *nskb;
+	unsigned int i;
+
+	for (i = 0; i < MACVLAN_HASH_SIZE; i++) {
+		hlist_for_each_entry_rcu(vlan, n, &port->vlan_hash[i], hlist) {
+			dev = vlan->dev;
+			if (unlikely(!(dev->flags & IFF_UP)))
+				continue;
+
+			nskb = skb_clone(skb, GFP_ATOMIC);
+			if (nskb == NULL) {
+				dev->stats.rx_errors++;
+				dev->stats.rx_dropped++;
+				continue;
+			}
+
+			dev->stats.rx_bytes += skb->len + ETH_HLEN;
+			dev->stats.rx_packets++;
+			dev->stats.multicast++;
+			dev->last_rx = jiffies;
+
+			nskb->dev = dev;
+			if (!compare_ether_addr(eth->h_dest, dev->broadcast))
+				nskb->pkt_type = PACKET_BROADCAST;
+			else
+				nskb->pkt_type = PACKET_MULTICAST;
+
+			netif_rx(nskb);
+		}
+	}
+}
+
+/* called under rcu_read_lock() from netif_receive_skb */
+static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
+{
+	const struct ethhdr *eth = eth_hdr(skb);
+	const struct macvlan_port *port;
+	const struct macvlan_dev *vlan;
+	struct net_device *dev;
+
+	port = rcu_dereference(skb->dev->macvlan_port);
+	if (port == NULL)
+		return skb;
+
+	if (is_multicast_ether_addr(eth->h_dest)) {
+		macvlan_broadcast(skb, port);
+		return skb;
+	}
+
+	vlan = macvlan_hash_lookup(port, eth->h_dest);
+	if (vlan == NULL)
+		return skb;
+
+	dev = vlan->dev;
+	if (unlikely(!(dev->flags & IFF_UP))) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (skb == NULL) {
+		dev->stats.rx_errors++;
+		dev->stats.rx_dropped++;
+		return NULL;
+	}
+
+	dev->stats.rx_bytes += skb->len + ETH_HLEN;
+	dev->stats.rx_packets++;
+	dev->last_rx = jiffies;
+
+	skb->dev = dev;
+	skb->pkt_type = PACKET_HOST;
+
+	netif_rx(skb);
+	return NULL;
+}
+
+static int macvlan_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct macvlan_dev *vlan = netdev_priv(dev);
+	unsigned int len = skb->len;
+	int ret;
+
+	skb->dev = vlan->lowerdev;
+	ret = dev_queue_xmit(skb);
+
+	if (likely(ret == NET_XMIT_SUCCESS)) {
+		dev->stats.tx_packets++;
+		dev->stats.tx_bytes += len;
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+}
+
+static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
+			       unsigned short type, void *daddr, void *saddr,
+			       unsigned len)
+{
+	const struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+
+	return lowerdev->hard_header(skb, lowerdev, type, daddr,
+				     saddr ? : dev->dev_addr, len);
+}
+
+static int macvlan_open(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvlan_port *port = vlan->port;
+	struct net_device *lowerdev = vlan->lowerdev;
+	int err;
+
+	err = dev_unicast_add(lowerdev, dev->dev_addr, ETH_ALEN);
+	if (err < 0)
+		return err;
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(lowerdev, 1);
+
+	hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[dev->dev_addr[5]]);
+	return 0;
+}
+
+static int macvlan_stop(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+
+	dev_mc_unsync(lowerdev, dev);
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(lowerdev, -1);
+
+	dev_unicast_delete(lowerdev, dev->dev_addr, ETH_ALEN);
+
+	hlist_del_rcu(&vlan->hlist);
+	synchronize_rcu();
+	return 0;
+}
+
+static void macvlan_change_rx_flags(struct net_device *dev, int change)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+}
+
+static void macvlan_set_multicast_list(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	dev_mc_sync(vlan->lowerdev, dev);
+}
+
+static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	if (new_mtu < 68 || vlan->lowerdev->mtu < new_mtu)
+		return -EINVAL;
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/*
+ * macvlan network devices have devices nesting below it and are a special
+ * "super class" of normal network devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key macvlan_netdev_xmit_lock_key;
+
+#define MACVLAN_FEATURES \
+	(NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
+	 NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \
+	 NETIF_F_TSO_ECN | NETIF_F_TSO6)
+
+#define MACVLAN_STATE_MASK \
+	((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))
+
+static int macvlan_init(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	const struct net_device *lowerdev = vlan->lowerdev;
+
+	dev->state		= (dev->state & ~MACVLAN_STATE_MASK) |
+				  (lowerdev->state & MACVLAN_STATE_MASK);
+	dev->features 		= lowerdev->features & MACVLAN_FEATURES;
+	dev->iflink		= lowerdev->ifindex;
+
+	lockdep_set_class(&dev->_xmit_lock, &macvlan_netdev_xmit_lock_key);
+	return 0;
+}
+
+static void macvlan_ethtool_get_drvinfo(struct net_device *dev,
+					struct ethtool_drvinfo *drvinfo)
+{
+	snprintf(drvinfo->driver, 32, "macvlan");
+	snprintf(drvinfo->version, 32, "0.1");
+}
+
+static u32 macvlan_ethtool_get_rx_csum(struct net_device *dev)
+{
+	const struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+
+	if (lowerdev->ethtool_ops->get_rx_csum == NULL)
+		return 0;
+	return lowerdev->ethtool_ops->get_rx_csum(lowerdev);
+}
+
+static const struct ethtool_ops macvlan_ethtool_ops = {
+	.get_link		= ethtool_op_get_link,
+	.get_rx_csum		= macvlan_ethtool_get_rx_csum,
+	.get_tx_csum		= ethtool_op_get_tx_csum,
+	.get_tso		= ethtool_op_get_tso,
+	.get_ufo		= ethtool_op_get_ufo,
+	.get_sg			= ethtool_op_get_sg,
+	.get_drvinfo		= macvlan_ethtool_get_drvinfo,
+};
+
+static void macvlan_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->init		= macvlan_init;
+	dev->open		= macvlan_open;
+	dev->stop		= macvlan_stop;
+	dev->change_mtu		= macvlan_change_mtu;
+	dev->change_rx_flags	= macvlan_change_rx_flags;
+	dev->set_multicast_list	= macvlan_set_multicast_list;
+	dev->hard_header	= macvlan_hard_header;
+	dev->hard_start_xmit	= macvlan_hard_start_xmit;
+	dev->destructor		= free_netdev;
+	dev->ethtool_ops	= &macvlan_ethtool_ops;
+	dev->tx_queue_len	= 0;
+}
+
+static int macvlan_port_create(struct net_device *dev)
+{
+	struct macvlan_port *port;
+	unsigned int i;
+
+	if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
+		return -EINVAL;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (port == NULL)
+		return -ENOMEM;
+
+	port->dev = dev;
+	INIT_LIST_HEAD(&port->vlans);
+	for (i = 0; i < MACVLAN_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&port->vlan_hash[i]);
+	rcu_assign_pointer(dev->macvlan_port, port);
+	return 0;
+}
+
+static void macvlan_port_destroy(struct net_device *dev)
+{
+	struct macvlan_port *port = dev->macvlan_port;
+
+	rcu_assign_pointer(dev->macvlan_port, NULL);
+	synchronize_rcu();
+	kfree(port);
+}
+
+static void macvlan_transfer_operstate(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	const struct net_device *lowerdev = vlan->lowerdev;
+
+	if (lowerdev->operstate == IF_OPER_DORMANT)
+		netif_dormant_on(dev);
+	else
+		netif_dormant_off(dev);
+
+	if (netif_carrier_ok(lowerdev)) {
+		if (!netif_carrier_ok(dev))
+			netif_carrier_on(dev);
+	} else {
+		if (netif_carrier_ok(lowerdev))
+			netif_carrier_off(dev);
+	}
+}
+
+static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+			return -EINVAL;
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+			return -EADDRNOTAVAIL;
+	}
+	return 0;
+}
+
+static int macvlan_newlink(struct net_device *dev,
+			   struct nlattr *tb[], struct nlattr *data[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvlan_port *port;
+	struct net_device *lowerdev;
+	int err;
+
+	if (!tb[IFLA_LINK])
+		return -EINVAL;
+
+	lowerdev = __dev_get_by_index(nla_get_u32(tb[IFLA_LINK]));
+	if (lowerdev == NULL)
+		return -ENODEV;
+
+	if (!tb[IFLA_MTU])
+		dev->mtu = lowerdev->mtu;
+	else if (dev->mtu > lowerdev->mtu)
+		return -EINVAL;
+
+	if (!tb[IFLA_ADDRESS])
+		random_ether_addr(dev->dev_addr);
+
+	if (lowerdev->macvlan_port == NULL) {
+		err = macvlan_port_create(lowerdev);
+		if (err < 0)
+			return err;
+	}
+	port = lowerdev->macvlan_port;
+
+	vlan->lowerdev = lowerdev;
+	vlan->dev      = dev;
+	vlan->port     = port;
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		return err;
+
+	list_add_tail(&vlan->list, &port->vlans);
+	macvlan_transfer_operstate(dev);
+	return 0;
+}
+
+static void macvlan_dellink(struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct macvlan_port *port = vlan->port;
+
+	list_del(&vlan->list);
+	unregister_netdevice(dev);
+
+	if (list_empty(&port->vlans))
+		macvlan_port_destroy(dev);
+}
+
+static struct rtnl_link_ops macvlan_link_ops __read_mostly = {
+	.kind		= "macvlan",
+	.priv_size	= sizeof(struct macvlan_dev),
+	.setup		= macvlan_setup,
+	.validate	= macvlan_validate,
+	.newlink	= macvlan_newlink,
+	.dellink	= macvlan_dellink,
+};
+
+static int macvlan_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct macvlan_dev *vlan, *next;
+	struct macvlan_port *port;
+
+	port = dev->macvlan_port;
+	if (port == NULL)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_CHANGE:
+		list_for_each_entry(vlan, &port->vlans, list)
+			macvlan_transfer_operstate(vlan->dev);
+		break;
+	case NETDEV_FEAT_CHANGE:
+		list_for_each_entry(vlan, &port->vlans, list) {
+			vlan->dev->features = dev->features & MACVLAN_FEATURES;
+			netdev_features_change(vlan->dev);
+		}
+		break;
+	case NETDEV_UNREGISTER:
+		list_for_each_entry_safe(vlan, next, &port->vlans, list)
+			macvlan_dellink(vlan->dev);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block macvlan_notifier_block __read_mostly = {
+	.notifier_call	= macvlan_device_event,
+};
+
+static int __init macvlan_init_module(void)
+{
+	int err;
+
+	register_netdevice_notifier(&macvlan_notifier_block);
+	macvlan_handle_frame_hook = macvlan_handle_frame;
+
+	err = rtnl_link_register(&macvlan_link_ops);
+	if (err < 0)
+		goto err1;
+	return 0;
+err1:
+	macvlan_handle_frame_hook = macvlan_handle_frame;
+	unregister_netdevice_notifier(&macvlan_notifier_block);
+	return err;
+}
+
+static void __exit macvlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&macvlan_link_ops);
+	macvlan_handle_frame_hook = NULL;
+	unregister_netdevice_notifier(&macvlan_notifier_block);
+}
+
+module_init(macvlan_init_module);
+module_exit(macvlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Driver for MAC address based VLANs");
+MODULE_ALIAS_RTNL_LINK("macvlan");
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
new file mode 100644
index 000000000000..0d9d7ea2c1cc
--- /dev/null
+++ b/include/linux/if_macvlan.h
@@ -0,0 +1,9 @@
+#ifndef _LINUX_IF_MACVLAN_H
+#define _LINUX_IF_MACVLAN_H
+
+#ifdef __KERNEL__
+
+extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *);
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_IF_MACVLAN_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e5af458ab04b..322b5eae57dd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -564,6 +564,8 @@ struct net_device
 
 	/* bridge stuff */
 	struct net_bridge_port	*br_port;
+	/* macvlan */
+	struct macvlan_port	*macvlan_port;
 
 	/* class/net/name entry */
 	struct device		dev;
diff --git a/net/core/dev.c b/net/core/dev.c
index 59ec811d2b54..13a0d9f6da54 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -98,6 +98,7 @@
 #include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <linux/if_bridge.h>
+#include <linux/if_macvlan.h>
 #include <net/dst.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
@@ -1813,6 +1814,28 @@ static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
 #endif
 
+#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
+struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
+EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
+
+static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
+					     struct packet_type **pt_prev,
+					     int *ret,
+					     struct net_device *orig_dev)
+{
+	if (skb->dev->macvlan_port == NULL)
+		return skb;
+
+	if (*pt_prev) {
+		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = NULL;
+	}
+	return macvlan_handle_frame_hook(skb);
+}
+#else
+#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
+#endif
+
 #ifdef CONFIG_NET_CLS_ACT
 /* TODO: Maybe we should just force sch_ingress to be compiled in
  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
@@ -1918,6 +1941,9 @@ ncls:
 #endif
 
 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
+	if (!skb)
+		goto out;
+	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
 		goto out;
 
-- 
cgit v1.3-8-gc7d7


From 6460d948f3ebf7d5040328a60a0ab7221f69945b Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Sat, 14 Jul 2007 19:07:52 -0700
Subject: [NET]: Add ethtool support for NETIF_F_IPV6_CSUM devices.

Add ethtool utility function to set or clear IPV6_CSUM feature flag.
Modify tg3.c and bnx2.c to use this function when doing ethtool -K
to change tx checksum.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bnx2.c      |  2 +-
 drivers/net/tg3.c       |  2 +-
 include/linux/ethtool.h |  1 +
 net/core/ethtool.c      | 12 ++++++++++++
 4 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 4e5e1cb2adc1..d23861c8658c 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -6218,7 +6218,7 @@ bnx2_set_tx_csum(struct net_device *dev, u32 data)
 	struct bnx2 *bp = netdev_priv(dev);
 
 	if (CHIP_NUM(bp) == CHIP_NUM_5709)
-		return (ethtool_op_set_tx_hw_csum(dev, data));
+		return (ethtool_op_set_tx_ipv6_csum(dev, data));
 	else
 		return (ethtool_op_set_tx_csum(dev, data));
 }
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 32e4037dcb50..5ee14764fd74 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -8318,7 +8318,7 @@ static int tg3_set_tx_csum(struct net_device *dev, u32 data)
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
-		ethtool_op_set_tx_hw_csum(dev, data);
+		ethtool_op_set_tx_ipv6_csum(dev, data);
 	else
 		ethtool_op_set_tx_csum(dev, data);
 
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index f2d248f8cc92..3a632244f31b 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -265,6 +265,7 @@ u32 ethtool_op_get_link(struct net_device *dev);
 u32 ethtool_op_get_tx_csum(struct net_device *dev);
 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data);
 int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data);
+int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data);
 u32 ethtool_op_get_sg(struct net_device *dev);
 int ethtool_op_set_sg(struct net_device *dev, u32 data);
 u32 ethtool_op_get_tso(struct net_device *dev);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 8d5e5a09b576..0b531e98ec33 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -52,6 +52,17 @@ int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
 
 	return 0;
 }
+
+int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
+{
+	if (data)
+		dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	else
+		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+
+	return 0;
+}
+
 u32 ethtool_op_get_sg(struct net_device *dev)
 {
 	return (dev->features & NETIF_F_SG) != 0;
@@ -980,5 +991,6 @@ EXPORT_SYMBOL(ethtool_op_set_sg);
 EXPORT_SYMBOL(ethtool_op_set_tso);
 EXPORT_SYMBOL(ethtool_op_set_tx_csum);
 EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
+EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
 EXPORT_SYMBOL(ethtool_op_set_ufo);
 EXPORT_SYMBOL(ethtool_op_get_ufo);
-- 
cgit v1.3-8-gc7d7


From 370786f9cfd430cb424f00ce4110e75bb1b95a19 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sat, 14 Jul 2007 20:47:26 -0700
Subject: [NETFILTER]: x_tables: add connlimit match

ipt_connlimit has been sitting in POM-NG for a long time.
Here is a new shiny xt_connlimit with:

 * xtables'ified
 * will request the layer3 module
   (previously it hotdropped every packet when it was not loaded)
 * fixed: there was a deadlock in case of an OOM condition
 * support for any layer4 protocol (e.g. UDP/SCTP)
 * using jhash, as suggested by Eric Dumazet
 * ipv6 support

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_connlimit.h |  17 ++
 net/netfilter/Kconfig                  |   7 +
 net/netfilter/Makefile                 |   1 +
 net/netfilter/xt_connlimit.c           | 313 +++++++++++++++++++++++++++++++++
 4 files changed, 338 insertions(+)
 create mode 100644 include/linux/netfilter/xt_connlimit.h
 create mode 100644 net/netfilter/xt_connlimit.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/xt_connlimit.h b/include/linux/netfilter/xt_connlimit.h
new file mode 100644
index 000000000000..90ae8b474cb8
--- /dev/null
+++ b/include/linux/netfilter/xt_connlimit.h
@@ -0,0 +1,17 @@
+#ifndef _XT_CONNLIMIT_H
+#define _XT_CONNLIMIT_H
+
+struct xt_connlimit_data;
+
+struct xt_connlimit_info {
+	union {
+		u_int32_t v4_mask;
+		u_int32_t v6_mask[4];
+	};
+	unsigned int limit, inverse;
+
+	/* this needs to be at the end */
+	struct xt_connlimit_data *data __attribute__((aligned(8)));
+};
+
+#endif /* _XT_CONNLIMIT_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index df5e8dab871d..9415b9a5dba6 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -423,6 +423,13 @@ config NETFILTER_XT_MATCH_CONNBYTES
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_MATCH_CONNLIMIT
+	tristate '"connlimit" match support"'
+	depends on NETFILTER_XTABLES
+	---help---
+	  This match allows you to match against the number of parallel
+	  connections to a server per client IP address (or address block).
+
 config NETFILTER_XT_MATCH_CONNMARK
 	tristate  '"connmark" connection mark match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 58b4245a1723..3e4a16aeb04e 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNMARK) += xt_connmark.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
new file mode 100644
index 000000000000..3335dd5be962
--- /dev/null
+++ b/net/netfilter/xt_connlimit.c
@@ -0,0 +1,313 @@
+/*
+ * netfilter module to limit the number of parallel tcp
+ * connections per IP address.
+ *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
+ *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
+ *		only ignore TIME_WAIT or gone connections
+ *   Copyright © Jan Engelhardt <jengelh@gmx.de>, 2007
+ *
+ * based on ...
+ *
+ * Kernel module to match connection tracking information.
+ * GPL (C) 1999  Rusty Russell (rusty@rustcorp.com.au).
+ */
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connlimit.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+/* we will save the tuples of all connections we care about */
+struct xt_connlimit_conn {
+	struct list_head list;
+	struct nf_conntrack_tuple tuple;
+};
+
+struct xt_connlimit_data {
+	struct list_head iphash[256];
+	spinlock_t lock;
+};
+
+static u_int32_t connlimit_rnd;
+static bool connlimit_rnd_inited;
+
+static inline unsigned int connlimit_iphash(u_int32_t addr)
+{
+	if (unlikely(!connlimit_rnd_inited)) {
+		get_random_bytes(&connlimit_rnd, sizeof(connlimit_rnd));
+		connlimit_rnd_inited = true;
+	}
+	return jhash_1word(addr, connlimit_rnd) & 0xFF;
+}
+
+static inline unsigned int
+connlimit_iphash6(const union nf_conntrack_address *addr,
+		  const union nf_conntrack_address *mask)
+{
+	union nf_conntrack_address res;
+	unsigned int i;
+
+	if (unlikely(!connlimit_rnd_inited)) {
+		get_random_bytes(&connlimit_rnd, sizeof(connlimit_rnd));
+		connlimit_rnd_inited = true;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)
+		res.ip6[i] = addr->ip6[i] & mask->ip6[i];
+
+	return jhash2(res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF;
+}
+
+static inline bool already_closed(const struct nf_conn *conn)
+{
+	u_int16_t proto = conn->tuplehash[0].tuple.dst.protonum;
+
+	if (proto == IPPROTO_TCP)
+		return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT;
+	else
+		return 0;
+}
+
+static inline unsigned int
+same_source_net(const union nf_conntrack_address *addr,
+		const union nf_conntrack_address *mask,
+		const union nf_conntrack_address *u3, unsigned int family)
+{
+	if (family == AF_INET) {
+		return (addr->ip & mask->ip) == (u3->ip & mask->ip);
+	} else {
+		union nf_conntrack_address lh, rh;
+		unsigned int i;
+
+		for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) {
+			lh.ip6[i] = addr->ip6[i] & mask->ip6[i];
+			rh.ip6[i] = u3->ip6[i] & mask->ip6[i];
+		}
+
+		return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0;
+	}
+}
+
+static int count_them(struct xt_connlimit_data *data,
+		      const struct nf_conntrack_tuple *tuple,
+		      const union nf_conntrack_address *addr,
+		      const union nf_conntrack_address *mask,
+		      const struct xt_match *match)
+{
+	struct nf_conntrack_tuple_hash *found;
+	struct xt_connlimit_conn *conn;
+	struct xt_connlimit_conn *tmp;
+	struct nf_conn *found_ct;
+	struct list_head *hash;
+	bool addit = true;
+	int matches = 0;
+
+
+	if (match->family == AF_INET6)
+		hash = &data->iphash[connlimit_iphash6(addr, mask)];
+	else
+		hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)];
+
+	read_lock_bh(&nf_conntrack_lock);
+
+	/* check the saved connections */
+	list_for_each_entry_safe(conn, tmp, hash, list) {
+		found    = __nf_conntrack_find(&conn->tuple, NULL);
+		found_ct = NULL;
+
+		if (found != NULL)
+			found_ct = nf_ct_tuplehash_to_ctrack(found);
+
+		if (found_ct != NULL &&
+		    nf_ct_tuple_equal(&conn->tuple, tuple) &&
+		    !already_closed(found_ct))
+			/*
+			 * Just to be sure we have it only once in the list.
+			 * We should not see tuples twice unless someone hooks
+			 * this into a table without "-p tcp --syn".
+			 */
+			addit = false;
+
+		if (found == NULL) {
+			/* this one is gone */
+			list_del(&conn->list);
+			kfree(conn);
+			continue;
+		}
+
+		if (already_closed(found_ct)) {
+			/*
+			 * we do not care about connections which are
+			 * closed already -> ditch it
+			 */
+			list_del(&conn->list);
+			kfree(conn);
+			continue;
+		}
+
+		if (same_source_net(addr, mask, &conn->tuple.src.u3,
+		    match->family))
+			/* same source network -> be counted! */
+			++matches;
+	}
+
+	read_unlock_bh(&nf_conntrack_lock);
+
+	if (addit) {
+		/* save the new connection in our list */
+		conn = kzalloc(sizeof(*conn), GFP_ATOMIC);
+		if (conn == NULL)
+			return -ENOMEM;
+		conn->tuple = *tuple;
+		list_add(&conn->list, hash);
+		++matches;
+	}
+
+	return matches;
+}
+
+static bool connlimit_match(const struct sk_buff *skb,
+			    const struct net_device *in,
+			    const struct net_device *out,
+			    const struct xt_match *match,
+			    const void *matchinfo, int offset,
+			    unsigned int protoff, bool *hotdrop)
+{
+	const struct xt_connlimit_info *info = matchinfo;
+	union nf_conntrack_address addr, mask;
+	struct nf_conntrack_tuple tuple;
+	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	int connections;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct != NULL)
+		tuple_ptr = &ct->tuplehash[0].tuple;
+	else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+				    match->family, &tuple))
+		goto hotdrop;
+
+	if (match->family == AF_INET6) {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr));
+		memcpy(&mask.ip6, info->v6_mask, sizeof(info->v6_mask));
+	} else {
+		const struct iphdr *iph = ip_hdr(skb);
+		addr.ip = iph->saddr;
+		mask.ip = info->v4_mask;
+	}
+
+	spin_lock_bh(&info->data->lock);
+	connections = count_them(info->data, tuple_ptr, &addr, &mask, match);
+	spin_unlock_bh(&info->data->lock);
+
+	if (connections < 0) {
+		/* kmalloc failed, drop it entirely */
+		*hotdrop = true;
+		return false;
+	}
+
+	return (connections > info->limit) ^ info->inverse;
+
+ hotdrop:
+	*hotdrop = true;
+	return false;
+}
+
+static bool connlimit_check(const char *tablename, const void *ip,
+			    const struct xt_match *match, void *matchinfo,
+			    unsigned int hook_mask)
+{
+	struct xt_connlimit_info *info = matchinfo;
+	unsigned int i;
+
+	if (nf_ct_l3proto_try_module_get(match->family) < 0) {
+		printk(KERN_WARNING "cannot load conntrack support for "
+		       "address family %u\n", match->family);
+		return false;
+	}
+
+	/* init private data */
+	info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
+	if (info->data == NULL) {
+		nf_ct_l3proto_module_put(match->family);
+		return false;
+	}
+
+	spin_lock_init(&info->data->lock);
+	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i)
+		INIT_LIST_HEAD(&info->data->iphash[i]);
+
+	return true;
+}
+
+static void connlimit_destroy(const struct xt_match *match, void *matchinfo)
+{
+	struct xt_connlimit_info *info = matchinfo;
+	struct xt_connlimit_conn *conn;
+	struct xt_connlimit_conn *tmp;
+	struct list_head *hash = info->data->iphash;
+	unsigned int i;
+
+	nf_ct_l3proto_module_put(match->family);
+
+	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {
+		list_for_each_entry_safe(conn, tmp, &hash[i], list) {
+			list_del(&conn->list);
+			kfree(conn);
+		}
+	}
+
+	kfree(info->data);
+}
+
+static struct xt_match connlimit_reg[] __read_mostly = {
+	{
+		.name       = "connlimit",
+		.family     = AF_INET,
+		.checkentry = connlimit_check,
+		.match      = connlimit_match,
+		.matchsize  = sizeof(struct xt_connlimit_info),
+		.destroy    = connlimit_destroy,
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "connlimit",
+		.family     = AF_INET6,
+		.checkentry = connlimit_check,
+		.match      = connlimit_match,
+		.matchsize  = sizeof(struct xt_connlimit_info),
+		.destroy    = connlimit_destroy,
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init xt_connlimit_init(void)
+{
+	return xt_register_matches(connlimit_reg, ARRAY_SIZE(connlimit_reg));
+}
+
+static void __exit xt_connlimit_exit(void)
+{
+	xt_unregister_matches(connlimit_reg, ARRAY_SIZE(connlimit_reg));
+}
+
+module_init(xt_connlimit_init);
+module_exit(xt_connlimit_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@gmx.de>");
+MODULE_DESCRIPTION("netfilter xt_connlimit match module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_connlimit");
+MODULE_ALIAS("ip6t_connlimit");
-- 
cgit v1.3-8-gc7d7


From 4381ca3c23b07ba5b567f72325003020ddca0341 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sun, 15 Jul 2007 21:00:11 +0100
Subject: fix return type of skb_checksum_complete()

It returns __sum16, not unsigned int

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9391e4a4c344..ce256438e619 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1639,7 +1639,7 @@ static inline int skb_csum_unnecessary(const struct sk_buff *skb)
  *	if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the
  *	hardware has already verified the correctness of the checksum.
  */
-static inline unsigned int skb_checksum_complete(struct sk_buff *skb)
+static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
 {
 	return skb_csum_unnecessary(skb) ?
 	       0 : __skb_checksum_complete(skb);
-- 
cgit v1.3-8-gc7d7


From 22e03f3b58dfcca30f0c8de185022132459638d1 Mon Sep 17 00:00:00 2001
From: Raphael Assenat <raph@8d.com>
Date: Tue, 27 Feb 2007 19:49:53 +0000
Subject: leds: Add generic GPIO LED driver

This patch adds support for GPIO connected leds via the new GPIO framework.

Information about leds (gpio, polarity, name, default trigger) is passed
to the driver via platform_data.

Signed-off-by: Raphael Assenat <raph@8d.com>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
---
 drivers/leds/Kconfig     |   8 +++
 drivers/leds/Makefile    |   1 +
 drivers/leds/leds-gpio.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/leds.h     |  14 ++++
 4 files changed, 197 insertions(+)
 create mode 100644 drivers/leds/leds-gpio.c

(limited to 'include/linux')

diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 87d2046f866c..9ce3ca109c2f 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -95,6 +95,14 @@ config LEDS_COBALT
 	help
 	  This option enables support for the front LED on Cobalt Server
 
+config LEDS_GPIO
+	tristate "LED Support for GPIO connected LEDs"
+	depends on LEDS_CLASS && GENERIC_GPIO
+	help
+	  This option enables support for the LEDs connected to GPIO
+	  outputs. To be useful the particular board must have LEDs
+	  and they must be connected to the GPIO lines.
+
 comment "LED Triggers"
 
 config LEDS_TRIGGERS
diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile
index aa2c18efa5b2..f8995c9bc2ea 100644
--- a/drivers/leds/Makefile
+++ b/drivers/leds/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_LEDS_NET48XX)		+= leds-net48xx.o
 obj-$(CONFIG_LEDS_WRAP)			+= leds-wrap.o
 obj-$(CONFIG_LEDS_H1940)		+= leds-h1940.o
 obj-$(CONFIG_LEDS_COBALT)		+= leds-cobalt.o
+obj-$(CONFIG_LEDS_GPIO)			+= leds-gpio.o
 
 # LED Triggers
 obj-$(CONFIG_LEDS_TRIGGER_TIMER)	+= ledtrig-timer.o
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
new file mode 100644
index 000000000000..431dcb61902c
--- /dev/null
+++ b/drivers/leds/leds-gpio.c
@@ -0,0 +1,174 @@
+/*
+ * LEDs driver for GPIOs
+ *
+ * Copyright (C) 2007 8D Technologies inc.
+ * Raphael Assenat <raph@8d.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/leds.h>
+#include <asm/gpio.h>
+
+struct gpio_led_data {
+	struct led_classdev cdev;
+	unsigned gpio;
+	u8 active_low;
+};
+
+
+static void gpio_led_set(struct led_classdev *led_cdev,
+	enum led_brightness value)
+{
+	struct gpio_led_data *led_dat =
+		container_of(led_cdev, struct gpio_led_data, cdev);
+	int level;
+
+	if (value == LED_OFF)
+		level = 0;
+	else
+		level = 1;
+
+	if (led_dat->active_low)
+		level = !level;
+
+	gpio_set_value(led_dat->gpio, level);
+}
+
+static int __init gpio_led_probe(struct platform_device *pdev)
+{
+	struct gpio_led_platform_data *pdata = pdev->dev.platform_data;
+	struct gpio_led *cur_led;
+	struct gpio_led_data *leds_data, *led_dat;
+	int i, ret = 0;
+
+	if (!pdata)
+		return -EBUSY;
+
+	leds_data = kzalloc(sizeof(struct gpio_led_data) * pdata->num_leds,
+				GFP_KERNEL);
+	if (!leds_data)
+		return -ENOMEM;
+
+	for (i = 0; i < pdata->num_leds; i++) {
+		cur_led = &pdata->leds[i];
+		led_dat = &leds_data[i];
+
+		led_dat->cdev.name = cur_led->name;
+		led_dat->cdev.default_trigger = cur_led->default_trigger;
+		led_dat->gpio = cur_led->gpio;
+		led_dat->active_low = cur_led->active_low;
+		led_dat->cdev.brightness_set = gpio_led_set;
+		led_dat->cdev.brightness = cur_led->active_low ? LED_FULL : LED_OFF;
+
+		ret = gpio_request(led_dat->gpio, led_dat->cdev.name);
+		if (ret < 0)
+			goto err;
+
+		gpio_direction_output(led_dat->gpio, led_dat->active_low);
+
+		ret = led_classdev_register(&pdev->dev, &led_dat->cdev);
+		if (ret < 0) {
+			gpio_free(led_dat->gpio);
+			goto err;
+		}
+	}
+
+	platform_set_drvdata(pdev, leds_data);
+
+	return 0;
+
+err:
+	if (i > 0) {
+		for (i = i - 1; i >= 0; i--) {
+			led_classdev_unregister(&leds_data[i].cdev);
+			gpio_free(leds_data[i].gpio);
+		}
+	}
+	kfree(leds_data);
+
+	return ret;
+}
+
+static int __exit gpio_led_remove(struct platform_device *pdev)
+{
+	int i;
+	struct gpio_led_platform_data *pdata = pdev->dev.platform_data;
+	struct gpio_led_data *leds_data;
+
+	leds_data = platform_get_drvdata(pdev);
+
+	for (i = 0; i < pdata->num_leds; i++) {
+		led_classdev_unregister(&leds_data[i].cdev);
+		gpio_free(leds_data[i].gpio);
+	}
+	
+	kfree(leds_data);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int gpio_led_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct gpio_led_platform_data *pdata = pdev->dev.platform_data;
+	struct gpio_led_data *leds_data;
+	int i;
+	
+	leds_data = platform_get_drvdata(pdev);
+
+	for (i = 0; i < pdata->num_leds; i++)
+		led_classdev_suspend(&leds_data[i].cdev);
+
+	return 0;
+}
+
+static int gpio_led_resume(struct platform_device *pdev)
+{
+	struct gpio_led_platform_data *pdata = pdev->dev.platform_data;
+	struct gpio_led_data *leds_data;
+	int i;
+
+	leds_data = platform_get_drvdata(pdev);
+
+	for (i = 0; i < pdata->num_leds; i++)
+		led_classdev_resume(&leds_data[i].cdev);
+
+	return 0;
+}
+#else
+#define gpio_led_suspend NULL
+#define gpio_led_resume NULL
+#endif
+
+static struct platform_driver gpio_led_driver = {
+	.remove		= __exit_p(gpio_led_remove),
+	.suspend	= gpio_led_suspend,
+	.resume		= gpio_led_resume,
+	.driver		= {
+		.name	= "leds-gpio",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init gpio_led_init(void)
+{
+	return platform_driver_probe(&gpio_led_driver, gpio_led_probe);
+}
+
+static void __exit gpio_led_exit(void)
+{
+	platform_driver_unregister(&gpio_led_driver);
+}
+
+module_init(gpio_led_init);
+module_exit(gpio_led_exit);
+
+MODULE_AUTHOR("Raphael Assenat <raph@8d.com>");
+MODULE_DESCRIPTION("GPIO LED driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 88afceffb7cb..059abfe219dc 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -110,4 +110,18 @@ extern void ledtrig_ide_activity(void);
 #define ledtrig_ide_activity() do {} while(0)
 #endif
 
+/* For the leds-gpio driver */
+struct gpio_led {
+	const char *name;
+	char *default_trigger;
+	unsigned 	gpio;
+	u8 		active_low;
+};
+
+struct gpio_led_platform_data {
+	int 		num_leds;
+	struct gpio_led *leds;
+};
+
+
 #endif		/* __LINUX_LEDS_H_INCLUDED */
-- 
cgit v1.3-8-gc7d7


From f8a7c6fe14f556ca8eeddce258cb21392d0c3a2f Mon Sep 17 00:00:00 2001
From: Richard Purdie <rpurdie@rpsys.net>
Date: Sun, 8 Jul 2007 23:19:31 +0100
Subject: leds: Convert from struct class_device to struct device

Convert the LEDs class from struct class_device to struct device
since class_device is scheduled for removal.

Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/leds/led-class.c     | 49 ++++++++++++++++++++------------------------
 drivers/leds/led-triggers.c  | 13 ++++++------
 drivers/leds/leds-locomo.c   |  2 +-
 drivers/leds/leds.h          |  8 +++++---
 drivers/leds/ledtrig-timer.c | 49 +++++++++++++++++++++-----------------------
 include/linux/leds.h         |  3 +--
 6 files changed, 59 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 3c1711210e38..4211293ce862 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -2,7 +2,7 @@
  * LED Class Core
  *
  * Copyright (C) 2005 John Lenz <lenz@cs.wisc.edu>
- * Copyright (C) 2005-2006 Richard Purdie <rpurdie@openedhand.com>
+ * Copyright (C) 2005-2007 Richard Purdie <rpurdie@openedhand.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -24,9 +24,10 @@
 
 static struct class *leds_class;
 
-static ssize_t led_brightness_show(struct class_device *dev, char *buf)
+static ssize_t led_brightness_show(struct device *dev, 
+		struct device_attribute *attr, char *buf)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	ssize_t ret = 0;
 
 	/* no lock needed for this */
@@ -36,10 +37,10 @@ static ssize_t led_brightness_show(struct class_device *dev, char *buf)
 	return ret;
 }
 
-static ssize_t led_brightness_store(struct class_device *dev,
-				const char *buf, size_t size)
+static ssize_t led_brightness_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	ssize_t ret = -EINVAL;
 	char *after;
 	unsigned long state = simple_strtoul(buf, &after, 10);
@@ -56,10 +57,9 @@ static ssize_t led_brightness_store(struct class_device *dev,
 	return ret;
 }
 
-static CLASS_DEVICE_ATTR(brightness, 0644, led_brightness_show,
-			led_brightness_store);
+static DEVICE_ATTR(brightness, 0644, led_brightness_show, led_brightness_store);
 #ifdef CONFIG_LEDS_TRIGGERS
-static CLASS_DEVICE_ATTR(trigger, 0644, led_trigger_show, led_trigger_store);
+static DEVICE_ATTR(trigger, 0644, led_trigger_show, led_trigger_store);
 #endif
 
 /**
@@ -93,16 +93,15 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 {
 	int rc;
 
-	led_cdev->class_dev = class_device_create(leds_class, NULL, 0,
-						parent, "%s", led_cdev->name);
-	if (unlikely(IS_ERR(led_cdev->class_dev)))
-		return PTR_ERR(led_cdev->class_dev);
+	led_cdev->dev = device_create(leds_class, parent, 0, "%s",
+					    led_cdev->name);
+	if (unlikely(IS_ERR(led_cdev->dev)))
+		return PTR_ERR(led_cdev->dev);
 
-	class_set_devdata(led_cdev->class_dev, led_cdev);
+	dev_set_drvdata(led_cdev->dev, led_cdev);
 
 	/* register the attributes */
-	rc = class_device_create_file(led_cdev->class_dev,
-				      &class_device_attr_brightness);
+	rc = device_create_file(led_cdev->dev, &dev_attr_brightness);
 	if (rc)
 		goto err_out;
 
@@ -114,8 +113,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 #ifdef CONFIG_LEDS_TRIGGERS
 	rwlock_init(&led_cdev->trigger_lock);
 
-	rc = class_device_create_file(led_cdev->class_dev,
-				      &class_device_attr_trigger);
+	rc = device_create_file(led_cdev->dev, &dev_attr_trigger);
 	if (rc)
 		goto err_out_led_list;
 
@@ -123,18 +121,17 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 #endif
 
 	printk(KERN_INFO "Registered led device: %s\n",
-			led_cdev->class_dev->class_id);
+			led_cdev->name);
 
 	return 0;
 
 #ifdef CONFIG_LEDS_TRIGGERS
 err_out_led_list:
-	class_device_remove_file(led_cdev->class_dev,
-				&class_device_attr_brightness);
+	device_remove_file(led_cdev->dev, &dev_attr_brightness);
 	list_del(&led_cdev->node);
 #endif
 err_out:
-	class_device_unregister(led_cdev->class_dev);
+	device_unregister(led_cdev->dev);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(led_classdev_register);
@@ -147,18 +144,16 @@ EXPORT_SYMBOL_GPL(led_classdev_register);
  */
 void led_classdev_unregister(struct led_classdev *led_cdev)
 {
-	class_device_remove_file(led_cdev->class_dev,
-				&class_device_attr_brightness);
+	device_remove_file(led_cdev->dev, &dev_attr_brightness);
 #ifdef CONFIG_LEDS_TRIGGERS
-	class_device_remove_file(led_cdev->class_dev,
-				&class_device_attr_trigger);
+	device_remove_file(led_cdev->dev, &dev_attr_trigger);
 	write_lock(&led_cdev->trigger_lock);
 	if (led_cdev->trigger)
 		led_trigger_set(led_cdev, NULL);
 	write_unlock(&led_cdev->trigger_lock);
 #endif
 
-	class_device_unregister(led_cdev->class_dev);
+	device_unregister(led_cdev->dev);
 
 	write_lock(&leds_list_lock);
 	list_del(&led_cdev->node);
diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index b2438a03082b..575368c2b100 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -1,7 +1,7 @@
 /*
  * LED Triggers Core
  *
- * Copyright 2005-2006 Openedhand Ltd.
+ * Copyright 2005-2007 Openedhand Ltd.
  *
  * Author: Richard Purdie <rpurdie@openedhand.com>
  *
@@ -28,10 +28,10 @@
 static DEFINE_RWLOCK(triggers_list_lock);
 static LIST_HEAD(trigger_list);
 
-ssize_t led_trigger_store(struct class_device *dev, const char *buf,
-			size_t count)
+ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	char trigger_name[TRIG_NAME_MAX];
 	struct led_trigger *trig;
 	size_t len;
@@ -67,9 +67,10 @@ ssize_t led_trigger_store(struct class_device *dev, const char *buf,
 }
 
 
-ssize_t led_trigger_show(struct class_device *dev, char *buf)
+ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	struct led_trigger *trig;
 	int len = 0;
 
diff --git a/drivers/leds/leds-locomo.c b/drivers/leds/leds-locomo.c
index 6f2d449ba983..bfac499f3258 100644
--- a/drivers/leds/leds-locomo.c
+++ b/drivers/leds/leds-locomo.c
@@ -19,7 +19,7 @@
 static void locomoled_brightness_set(struct led_classdev *led_cdev,
 				enum led_brightness value, int offset)
 {
-	struct locomo_dev *locomo_dev = LOCOMO_DEV(led_cdev->class_dev->dev);
+	struct locomo_dev *locomo_dev = LOCOMO_DEV(led_cdev->dev);
 	unsigned long flags;
 
 	local_irq_save(flags);
diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h
index a715c4ed93ff..f2f3884fe063 100644
--- a/drivers/leds/leds.h
+++ b/drivers/leds/leds.h
@@ -13,6 +13,7 @@
 #ifndef __LEDS_H_INCLUDED
 #define __LEDS_H_INCLUDED
 
+#include <linux/device.h>
 #include <linux/leds.h>
 
 static inline void led_set_brightness(struct led_classdev *led_cdev,
@@ -37,8 +38,9 @@ void led_trigger_set(struct led_classdev *led_cdev,
 #define led_trigger_set(x, y) do {} while(0)
 #endif
 
-ssize_t led_trigger_store(struct class_device *dev, const char *buf,
-			size_t count);
-ssize_t led_trigger_show(struct class_device *dev, char *buf);
+ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr,
+			const char *buf, size_t count);
+ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr,
+			char *buf);
 
 #endif	/* __LEDS_H_INCLUDED */
diff --git a/drivers/leds/ledtrig-timer.c b/drivers/leds/ledtrig-timer.c
index d756bdb01c59..ed9ff02c77ea 100644
--- a/drivers/leds/ledtrig-timer.c
+++ b/drivers/leds/ledtrig-timer.c
@@ -52,9 +52,10 @@ static void led_timer_function(unsigned long data)
 	mod_timer(&timer_data->timer, jiffies + msecs_to_jiffies(delay));
 }
 
-static ssize_t led_delay_on_show(struct class_device *dev, char *buf)
+static ssize_t led_delay_on_show(struct device *dev, 
+		struct device_attribute *attr, char *buf)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	struct timer_trig_data *timer_data = led_cdev->trigger_data;
 
 	sprintf(buf, "%lu\n", timer_data->delay_on);
@@ -62,10 +63,10 @@ static ssize_t led_delay_on_show(struct class_device *dev, char *buf)
 	return strlen(buf) + 1;
 }
 
-static ssize_t led_delay_on_store(struct class_device *dev, const char *buf,
-				size_t size)
+static ssize_t led_delay_on_store(struct device *dev, 
+		struct device_attribute *attr, const char *buf, size_t size)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	struct timer_trig_data *timer_data = led_cdev->trigger_data;
 	int ret = -EINVAL;
 	char *after;
@@ -84,9 +85,10 @@ static ssize_t led_delay_on_store(struct class_device *dev, const char *buf,
 	return ret;
 }
 
-static ssize_t led_delay_off_show(struct class_device *dev, char *buf)
+static ssize_t led_delay_off_show(struct device *dev, 
+		struct device_attribute *attr, char *buf)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	struct timer_trig_data *timer_data = led_cdev->trigger_data;
 
 	sprintf(buf, "%lu\n", timer_data->delay_off);
@@ -94,10 +96,10 @@ static ssize_t led_delay_off_show(struct class_device *dev, char *buf)
 	return strlen(buf) + 1;
 }
 
-static ssize_t led_delay_off_store(struct class_device *dev, const char *buf,
-				size_t size)
+static ssize_t led_delay_off_store(struct device *dev, 
+		struct device_attribute *attr, const char *buf, size_t size)
 {
-	struct led_classdev *led_cdev = class_get_devdata(dev);
+	struct led_classdev *led_cdev = dev_get_drvdata(dev);
 	struct timer_trig_data *timer_data = led_cdev->trigger_data;
 	int ret = -EINVAL;
 	char *after;
@@ -116,10 +118,8 @@ static ssize_t led_delay_off_store(struct class_device *dev, const char *buf,
 	return ret;
 }
 
-static CLASS_DEVICE_ATTR(delay_on, 0644, led_delay_on_show,
-			led_delay_on_store);
-static CLASS_DEVICE_ATTR(delay_off, 0644, led_delay_off_show,
-			led_delay_off_store);
+static DEVICE_ATTR(delay_on, 0644, led_delay_on_show, led_delay_on_store);
+static DEVICE_ATTR(delay_off, 0644, led_delay_off_show, led_delay_off_store);
 
 static void timer_trig_activate(struct led_classdev *led_cdev)
 {
@@ -136,18 +136,17 @@ static void timer_trig_activate(struct led_classdev *led_cdev)
 	timer_data->timer.function = led_timer_function;
 	timer_data->timer.data = (unsigned long) led_cdev;
 
-	rc = class_device_create_file(led_cdev->class_dev,
-				&class_device_attr_delay_on);
-	if (rc) goto err_out;
-	rc = class_device_create_file(led_cdev->class_dev,
-				&class_device_attr_delay_off);
-	if (rc) goto err_out_delayon;
+	rc = device_create_file(led_cdev->dev, &dev_attr_delay_on);
+	if (rc)
+		goto err_out;
+	rc = device_create_file(led_cdev->dev, &dev_attr_delay_off);
+	if (rc)
+		goto err_out_delayon;
 
 	return;
 
 err_out_delayon:
-	class_device_remove_file(led_cdev->class_dev,
-				&class_device_attr_delay_on);
+	device_remove_file(led_cdev->dev, &dev_attr_delay_on);
 err_out:
 	led_cdev->trigger_data = NULL;
 	kfree(timer_data);
@@ -158,10 +157,8 @@ static void timer_trig_deactivate(struct led_classdev *led_cdev)
 	struct timer_trig_data *timer_data = led_cdev->trigger_data;
 
 	if (timer_data) {
-		class_device_remove_file(led_cdev->class_dev,
-					&class_device_attr_delay_on);
-		class_device_remove_file(led_cdev->class_dev,
-					&class_device_attr_delay_off);
+		device_remove_file(led_cdev->dev, &dev_attr_delay_on);
+		device_remove_file(led_cdev->dev, &dev_attr_delay_off);
 		del_timer_sync(&timer_data->timer);
 		kfree(timer_data);
 	}
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 059abfe219dc..dc1178f6184b 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -16,7 +16,6 @@
 #include <linux/spinlock.h>
 
 struct device;
-struct class_device;
 /*
  * LED Core
  */
@@ -38,7 +37,7 @@ struct led_classdev {
 	void		(*brightness_set)(struct led_classdev *led_cdev,
 					  enum led_brightness brightness);
 
-	struct class_device	*class_dev;
+	struct device		*dev;
 	struct list_head	 node;			/* LED Device list */
 	char			*default_trigger;	/* Trigger to use */
 
-- 
cgit v1.3-8-gc7d7


From 655bfd7aebb12481ab9275284d9500bee5ba3e70 Mon Sep 17 00:00:00 2001
From: Richard Purdie <rpurdie@rpsys.net>
Date: Mon, 9 Jul 2007 12:17:24 +0100
Subject: backlight: Convert from struct class_device to struct device

Convert the backlight and LCD classes from struct class_device
to struct device since class_device is scheduled for removal.

One nasty API break is the backlight power attribute has had to be
renamed to bl_power and the LCD power attribute has had to be renamed
to lcd_power since the original names clash with the core. I can't see
a way around this.

Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/acpi/video.c                 |   4 +-
 drivers/usb/misc/appledisplay.c      |   4 +-
 drivers/video/aty/aty128fb.c         |   2 +-
 drivers/video/aty/atyfb_base.c       |   2 +-
 drivers/video/aty/radeon_backlight.c |   4 +-
 drivers/video/backlight/backlight.c  | 125 +++++++++++++++--------------------
 drivers/video/backlight/cr_bllcd.c   |   2 +-
 drivers/video/backlight/lcd.c        | 112 ++++++++++++++-----------------
 drivers/video/nvidia/nv_backlight.c  |   2 +-
 drivers/video/riva/fbdev.c           |   2 +-
 include/linux/backlight.h            |  11 ++-
 include/linux/lcd.h                  |  14 ++--
 12 files changed, 132 insertions(+), 152 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index 00d25b347255..7fd672af33ba 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -278,7 +278,7 @@ static int acpi_video_get_brightness(struct backlight_device *bd)
 {
 	unsigned long cur_level;
 	struct acpi_video_device *vd =
-		(struct acpi_video_device *)class_get_devdata(&bd->class_dev);
+		(struct acpi_video_device *)bl_get_data(bd);
 	acpi_video_device_lcd_get_level_current(vd, &cur_level);
 	return (int) cur_level;
 }
@@ -287,7 +287,7 @@ static int acpi_video_set_brightness(struct backlight_device *bd)
 {
 	int request_level = bd->props.brightness;
 	struct acpi_video_device *vd =
-		(struct acpi_video_device *)class_get_devdata(&bd->class_dev);
+		(struct acpi_video_device *)bl_get_data(bd);
 	acpi_video_device_lcd_set_level(vd, request_level);
 	return 0;
 }
diff --git a/drivers/usb/misc/appledisplay.c b/drivers/usb/misc/appledisplay.c
index cf70c16f0e3f..4e88553e1666 100644
--- a/drivers/usb/misc/appledisplay.c
+++ b/drivers/usb/misc/appledisplay.c
@@ -137,7 +137,7 @@ exit:
 
 static int appledisplay_bl_update_status(struct backlight_device *bd)
 {
-	struct appledisplay *pdata = class_get_devdata(&bd->class_dev);
+	struct appledisplay *pdata = bl_get_data(bd);
 	int retval;
 
 	pdata->msgdata[0] = 0x10;
@@ -158,7 +158,7 @@ static int appledisplay_bl_update_status(struct backlight_device *bd)
 
 static int appledisplay_bl_get_brightness(struct backlight_device *bd)
 {
-	struct appledisplay *pdata = class_get_devdata(&bd->class_dev);
+	struct appledisplay *pdata = bl_get_data(bd);
 	int retval;
 
 	retval = usb_control_msg(
diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index 7fea4d8ae8e2..cfcbe37d2d70 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -1733,7 +1733,7 @@ static int aty128_bl_get_level_brightness(struct aty128fb_par *par,
 
 static int aty128_bl_update_status(struct backlight_device *bd)
 {
-	struct aty128fb_par *par = class_get_devdata(&bd->class_dev);
+	struct aty128fb_par *par = bl_get_data(bd);
 	unsigned int reg = aty_ld_le32(LVDS_GEN_CNTL);
 	int level;
 
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index 2fbff6317433..d2c68c3d8d76 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -2141,7 +2141,7 @@ static int aty_bl_get_level_brightness(struct atyfb_par *par, int level)
 
 static int aty_bl_update_status(struct backlight_device *bd)
 {
-	struct atyfb_par *par = class_get_devdata(&bd->class_dev);
+	struct atyfb_par *par = bl_get_data(bd);
 	unsigned int reg = aty_ld_lcd(LCD_MISC_CNTL, par);
 	int level;
 
diff --git a/drivers/video/aty/radeon_backlight.c b/drivers/video/aty/radeon_backlight.c
index 0be25fa5540c..1a056adb61c8 100644
--- a/drivers/video/aty/radeon_backlight.c
+++ b/drivers/video/aty/radeon_backlight.c
@@ -47,7 +47,7 @@ static int radeon_bl_get_level_brightness(struct radeon_bl_privdata *pdata,
 
 static int radeon_bl_update_status(struct backlight_device *bd)
 {
-	struct radeon_bl_privdata *pdata = class_get_devdata(&bd->class_dev);
+	struct radeon_bl_privdata *pdata = bl_get_data(bd);
 	struct radeonfb_info *rinfo = pdata->rinfo;
 	u32 lvds_gen_cntl, tmpPixclksCntl;
 	int level;
@@ -206,7 +206,7 @@ void radeonfb_bl_exit(struct radeonfb_info *rinfo)
 	if (bd) {
 		struct radeon_bl_privdata *pdata;
 
-		pdata = class_get_devdata(&bd->class_dev);
+		pdata = bl_get_data(bd);
 		backlight_device_unregister(bd);
 		kfree(pdata);
 		rinfo->info->bl_dev = NULL;
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 7e06223bca94..b26de8cf3112 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -69,18 +69,20 @@ static inline void backlight_unregister_fb(struct backlight_device *bd)
 }
 #endif /* CONFIG_FB */
 
-static ssize_t backlight_show_power(struct class_device *cdev, char *buf)
+static ssize_t backlight_show_power(struct device *dev,
+		struct device_attribute *attr,char *buf)
 {
-	struct backlight_device *bd = to_backlight_device(cdev);
+	struct backlight_device *bd = to_backlight_device(dev);
 
 	return sprintf(buf, "%d\n", bd->props.power);
 }
 
-static ssize_t backlight_store_power(struct class_device *cdev, const char *buf, size_t count)
+static ssize_t backlight_store_power(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
 {
 	int rc = -ENXIO;
 	char *endp;
-	struct backlight_device *bd = to_backlight_device(cdev);
+	struct backlight_device *bd = to_backlight_device(dev);
 	int power = simple_strtoul(buf, &endp, 0);
 	size_t size = endp - buf;
 
@@ -101,18 +103,20 @@ static ssize_t backlight_store_power(struct class_device *cdev, const char *buf,
 	return rc;
 }
 
-static ssize_t backlight_show_brightness(struct class_device *cdev, char *buf)
+static ssize_t backlight_show_brightness(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	struct backlight_device *bd = to_backlight_device(cdev);
+	struct backlight_device *bd = to_backlight_device(dev);
 
 	return sprintf(buf, "%d\n", bd->props.brightness);
 }
 
-static ssize_t backlight_store_brightness(struct class_device *cdev, const char *buf, size_t count)
+static ssize_t backlight_store_brightness(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
 {
 	int rc = -ENXIO;
 	char *endp;
-	struct backlight_device *bd = to_backlight_device(cdev);
+	struct backlight_device *bd = to_backlight_device(dev);
 	int brightness = simple_strtoul(buf, &endp, 0);
 	size_t size = endp - buf;
 
@@ -138,18 +142,19 @@ static ssize_t backlight_store_brightness(struct class_device *cdev, const char
 	return rc;
 }
 
-static ssize_t backlight_show_max_brightness(struct class_device *cdev, char *buf)
+static ssize_t backlight_show_max_brightness(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	struct backlight_device *bd = to_backlight_device(cdev);
+	struct backlight_device *bd = to_backlight_device(dev);
 
 	return sprintf(buf, "%d\n", bd->props.max_brightness);
 }
 
-static ssize_t backlight_show_actual_brightness(struct class_device *cdev,
-						char *buf)
+static ssize_t backlight_show_actual_brightness(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
 	int rc = -ENXIO;
-	struct backlight_device *bd = to_backlight_device(cdev);
+	struct backlight_device *bd = to_backlight_device(dev);
 
 	mutex_lock(&bd->ops_lock);
 	if (bd->ops && bd->ops->get_brightness)
@@ -159,31 +164,22 @@ static ssize_t backlight_show_actual_brightness(struct class_device *cdev,
 	return rc;
 }
 
-static void backlight_class_release(struct class_device *dev)
+struct class *backlight_class;
+
+static void bl_device_release(struct device *dev)
 {
 	struct backlight_device *bd = to_backlight_device(dev);
 	kfree(bd);
 }
 
-static struct class backlight_class = {
-	.name = "backlight",
-	.release = backlight_class_release,
-};
-
-#define DECLARE_ATTR(_name,_mode,_show,_store)			\
-{							 	\
-	.attr	= { .name = __stringify(_name), .mode = _mode }, \
-	.show	= _show,					\
-	.store	= _store,					\
-}
-
-static const struct class_device_attribute bl_class_device_attributes[] = {
-	DECLARE_ATTR(power, 0644, backlight_show_power, backlight_store_power),
-	DECLARE_ATTR(brightness, 0644, backlight_show_brightness,
+static struct device_attribute bl_device_attributes[] = {
+	__ATTR(bl_power, 0644, backlight_show_power, backlight_store_power),
+	__ATTR(brightness, 0644, backlight_show_brightness,
 		     backlight_store_brightness),
-	DECLARE_ATTR(actual_brightness, 0444, backlight_show_actual_brightness,
+	__ATTR(actual_brightness, 0444, backlight_show_actual_brightness,
 		     NULL),
-	DECLARE_ATTR(max_brightness, 0444, backlight_show_max_brightness, NULL),
+	__ATTR(max_brightness, 0444, backlight_show_max_brightness, NULL),
+	__ATTR_NULL,
 };
 
 /**
@@ -191,22 +187,20 @@ static const struct class_device_attribute bl_class_device_attributes[] = {
  *   backlight_device class.
  * @name: the name of the new object(must be the same as the name of the
  *   respective framebuffer device).
- * @devdata: an optional pointer to be stored in the class_device. The
- *   methods may retrieve it by using class_get_devdata(&bd->class_dev).
+ * @devdata: an optional pointer to be stored for private driver use. The
+ *   methods may retrieve it by using bl_get_data(bd).
  * @ops: the backlight operations structure.
  *
- * Creates and registers new backlight class_device. Returns either an
+ * Creates and registers new backlight device. Returns either an
  * ERR_PTR() or a pointer to the newly allocated device.
  */
 struct backlight_device *backlight_device_register(const char *name,
-	struct device *dev,
-	void *devdata,
-	struct backlight_ops *ops)
+		struct device *parent, void *devdata, struct backlight_ops *ops)
 {
-	int i, rc;
 	struct backlight_device *new_bd;
+	int rc;
 
-	pr_debug("backlight_device_alloc: name=%s\n", name);
+	pr_debug("backlight_device_register: name=%s\n", name);
 
 	new_bd = kzalloc(sizeof(struct backlight_device), GFP_KERNEL);
 	if (!new_bd)
@@ -214,13 +208,14 @@ struct backlight_device *backlight_device_register(const char *name,
 
 	mutex_init(&new_bd->update_lock);
 	mutex_init(&new_bd->ops_lock);
-	new_bd->ops = ops;
-	new_bd->class_dev.class = &backlight_class;
-	new_bd->class_dev.dev = dev;
-	strlcpy(new_bd->class_dev.class_id, name, KOBJ_NAME_LEN);
-	class_set_devdata(&new_bd->class_dev, devdata);
 
-	rc = class_device_register(&new_bd->class_dev);
+	new_bd->dev.class = backlight_class;
+	new_bd->dev.parent = parent;
+	new_bd->dev.release = bl_device_release;
+	strlcpy(new_bd->dev.bus_id, name, BUS_ID_SIZE);
+	dev_set_drvdata(&new_bd->dev, devdata);
+
+	rc = device_register(&new_bd->dev);
 	if (rc) {
 		kfree(new_bd);
 		return ERR_PTR(rc);
@@ -228,23 +223,11 @@ struct backlight_device *backlight_device_register(const char *name,
 
 	rc = backlight_register_fb(new_bd);
 	if (rc) {
-		class_device_unregister(&new_bd->class_dev);
+		device_unregister(&new_bd->dev);
 		return ERR_PTR(rc);
 	}
 
-
-	for (i = 0; i < ARRAY_SIZE(bl_class_device_attributes); i++) {
-		rc = class_device_create_file(&new_bd->class_dev,
-					      &bl_class_device_attributes[i]);
-		if (rc) {
-			while (--i >= 0)
-				class_device_remove_file(&new_bd->class_dev,
-							 &bl_class_device_attributes[i]);
-			class_device_unregister(&new_bd->class_dev);
-			/* No need to kfree(new_bd) since release() method was called */
-			return ERR_PTR(rc);
-		}
-	}
+	new_bd->ops = ops;
 
 #ifdef CONFIG_PMAC_BACKLIGHT
 	mutex_lock(&pmac_backlight_mutex);
@@ -265,42 +248,40 @@ EXPORT_SYMBOL(backlight_device_register);
  */
 void backlight_device_unregister(struct backlight_device *bd)
 {
-	int i;
-
 	if (!bd)
 		return;
 
-	pr_debug("backlight_device_unregister: name=%s\n", bd->class_dev.class_id);
-
 #ifdef CONFIG_PMAC_BACKLIGHT
 	mutex_lock(&pmac_backlight_mutex);
 	if (pmac_backlight == bd)
 		pmac_backlight = NULL;
 	mutex_unlock(&pmac_backlight_mutex);
 #endif
-
-	for (i = 0; i < ARRAY_SIZE(bl_class_device_attributes); i++)
-		class_device_remove_file(&bd->class_dev,
-					 &bl_class_device_attributes[i]);
-
 	mutex_lock(&bd->ops_lock);
 	bd->ops = NULL;
 	mutex_unlock(&bd->ops_lock);
 
 	backlight_unregister_fb(bd);
-
-	class_device_unregister(&bd->class_dev);
+	device_unregister(&bd->dev);
 }
 EXPORT_SYMBOL(backlight_device_unregister);
 
 static void __exit backlight_class_exit(void)
 {
-	class_unregister(&backlight_class);
+	class_destroy(backlight_class);
 }
 
 static int __init backlight_class_init(void)
 {
-	return class_register(&backlight_class);
+	backlight_class = class_create(THIS_MODULE, "backlight");
+	if (IS_ERR(backlight_class)) {
+		printk(KERN_WARNING "Unable to create backlight class; errno = %ld\n",
+				PTR_ERR(backlight_class));
+		return PTR_ERR(backlight_class);
+	}
+
+	backlight_class->dev_attrs = bl_device_attributes;
+	return 0;
 }
 
 /*
diff --git a/drivers/video/backlight/cr_bllcd.c b/drivers/video/backlight/cr_bllcd.c
index e9bbc3455c94..3633b6e93e27 100644
--- a/drivers/video/backlight/cr_bllcd.c
+++ b/drivers/video/backlight/cr_bllcd.c
@@ -202,7 +202,7 @@ static int cr_backlight_probe(struct platform_device *pdev)
 	}
 
 	crp->cr_lcd_device = lcd_device_register("cr-lcd",
-							&pdev->dev,
+							&pdev->dev, NULL
 							&cr_lcd_ops);
 
 	if (IS_ERR(crp->cr_lcd_device)) {
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 648b53c1fdea..6f652c65fae1 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -61,10 +61,11 @@ static inline void lcd_unregister_fb(struct lcd_device *ld)
 }
 #endif /* CONFIG_FB */
 
-static ssize_t lcd_show_power(struct class_device *cdev, char *buf)
+static ssize_t lcd_show_power(struct device *dev, struct device_attribute *attr,
+		char *buf)
 {
 	int rc;
-	struct lcd_device *ld = to_lcd_device(cdev);
+	struct lcd_device *ld = to_lcd_device(dev);
 
 	mutex_lock(&ld->ops_lock);
 	if (ld->ops && ld->ops->get_power)
@@ -76,11 +77,12 @@ static ssize_t lcd_show_power(struct class_device *cdev, char *buf)
 	return rc;
 }
 
-static ssize_t lcd_store_power(struct class_device *cdev, const char *buf, size_t count)
+static ssize_t lcd_store_power(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
 {
 	int rc = -ENXIO;
 	char *endp;
-	struct lcd_device *ld = to_lcd_device(cdev);
+	struct lcd_device *ld = to_lcd_device(dev);
 	int power = simple_strtoul(buf, &endp, 0);
 	size_t size = endp - buf;
 
@@ -100,10 +102,11 @@ static ssize_t lcd_store_power(struct class_device *cdev, const char *buf, size_
 	return rc;
 }
 
-static ssize_t lcd_show_contrast(struct class_device *cdev, char *buf)
+static ssize_t lcd_show_contrast(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
 	int rc = -ENXIO;
-	struct lcd_device *ld = to_lcd_device(cdev);
+	struct lcd_device *ld = to_lcd_device(dev);
 
 	mutex_lock(&ld->ops_lock);
 	if (ld->ops && ld->ops->get_contrast)
@@ -113,11 +116,12 @@ static ssize_t lcd_show_contrast(struct class_device *cdev, char *buf)
 	return rc;
 }
 
-static ssize_t lcd_store_contrast(struct class_device *cdev, const char *buf, size_t count)
+static ssize_t lcd_store_contrast(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
 {
 	int rc = -ENXIO;
 	char *endp;
-	struct lcd_device *ld = to_lcd_device(cdev);
+	struct lcd_device *ld = to_lcd_device(dev);
 	int contrast = simple_strtoul(buf, &endp, 0);
 	size_t size = endp - buf;
 
@@ -137,53 +141,45 @@ static ssize_t lcd_store_contrast(struct class_device *cdev, const char *buf, si
 	return rc;
 }
 
-static ssize_t lcd_show_max_contrast(struct class_device *cdev, char *buf)
+static ssize_t lcd_show_max_contrast(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	struct lcd_device *ld = to_lcd_device(cdev);
+	struct lcd_device *ld = to_lcd_device(dev);
 
 	return sprintf(buf, "%d\n", ld->props.max_contrast);
 }
 
-static void lcd_class_release(struct class_device *dev)
+struct class *lcd_class;
+
+static void lcd_device_release(struct device *dev)
 {
 	struct lcd_device *ld = to_lcd_device(dev);
 	kfree(ld);
 }
 
-static struct class lcd_class = {
-	.name = "lcd",
-	.release = lcd_class_release,
-};
-
-#define DECLARE_ATTR(_name,_mode,_show,_store)			\
-{							 	\
-	.attr	= { .name = __stringify(_name), .mode = _mode }, \
-	.show	= _show,					\
-	.store	= _store,					\
-}
-
-static const struct class_device_attribute lcd_class_device_attributes[] = {
-	DECLARE_ATTR(power, 0644, lcd_show_power, lcd_store_power),
-	DECLARE_ATTR(contrast, 0644, lcd_show_contrast, lcd_store_contrast),
-	DECLARE_ATTR(max_contrast, 0444, lcd_show_max_contrast, NULL),
+static struct device_attribute lcd_device_attributes[] = {
+	__ATTR(lcd_power, 0644, lcd_show_power, lcd_store_power),
+	__ATTR(contrast, 0644, lcd_show_contrast, lcd_store_contrast),
+	__ATTR(max_contrast, 0444, lcd_show_max_contrast, NULL),
+	__ATTR_NULL,
 };
 
 /**
  * lcd_device_register - register a new object of lcd_device class.
  * @name: the name of the new object(must be the same as the name of the
  *   respective framebuffer device).
- * @devdata: an optional pointer to be stored in the class_device. The
- *   methods may retrieve it by using class_get_devdata(ld->class_dev).
+ * @devdata: an optional pointer to be stored in the device. The
+ *   methods may retrieve it by using lcd_get_data(ld).
  * @ops: the lcd operations structure.
  *
- * Creates and registers a new lcd class_device. Returns either an ERR_PTR()
+ * Creates and registers a new lcd device. Returns either an ERR_PTR()
  * or a pointer to the newly allocated device.
  */
-struct lcd_device *lcd_device_register(const char *name, void *devdata,
-				       struct lcd_ops *ops)
+struct lcd_device *lcd_device_register(const char *name, struct device *parent,
+		void *devdata, struct lcd_ops *ops)
 {
-	int i, rc;
 	struct lcd_device *new_ld;
+	int rc;
 
 	pr_debug("lcd_device_register: name=%s\n", name);
 
@@ -193,12 +189,14 @@ struct lcd_device *lcd_device_register(const char *name, void *devdata,
 
 	mutex_init(&new_ld->ops_lock);
 	mutex_init(&new_ld->update_lock);
-	new_ld->ops = ops;
-	new_ld->class_dev.class = &lcd_class;
-	strlcpy(new_ld->class_dev.class_id, name, KOBJ_NAME_LEN);
-	class_set_devdata(&new_ld->class_dev, devdata);
 
-	rc = class_device_register(&new_ld->class_dev);
+	new_ld->dev.class = lcd_class;
+	new_ld->dev.parent = parent;
+	new_ld->dev.release = lcd_device_release;
+	strlcpy(new_ld->dev.bus_id, name, BUS_ID_SIZE);
+	dev_set_drvdata(&new_ld->dev, devdata);
+
+	rc = device_register(&new_ld->dev);
 	if (rc) {
 		kfree(new_ld);
 		return ERR_PTR(rc);
@@ -206,22 +204,11 @@ struct lcd_device *lcd_device_register(const char *name, void *devdata,
 
 	rc = lcd_register_fb(new_ld);
 	if (rc) {
-		class_device_unregister(&new_ld->class_dev);
+		device_unregister(&new_ld->dev);
 		return ERR_PTR(rc);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(lcd_class_device_attributes); i++) {
-		rc = class_device_create_file(&new_ld->class_dev,
-					      &lcd_class_device_attributes[i]);
-		if (rc) {
-			while (--i >= 0)
-				class_device_remove_file(&new_ld->class_dev,
-							 &lcd_class_device_attributes[i]);
-			class_device_unregister(&new_ld->class_dev);
-			/* No need to kfree(new_ld) since release() method was called */
-			return ERR_PTR(rc);
-		}
-	}
+	new_ld->ops = ops;
 
 	return new_ld;
 }
@@ -235,33 +222,34 @@ EXPORT_SYMBOL(lcd_device_register);
  */
 void lcd_device_unregister(struct lcd_device *ld)
 {
-	int i;
-
 	if (!ld)
 		return;
 
-	pr_debug("lcd_device_unregister: name=%s\n", ld->class_dev.class_id);
-
-	for (i = 0; i < ARRAY_SIZE(lcd_class_device_attributes); i++)
-		class_device_remove_file(&ld->class_dev,
-					 &lcd_class_device_attributes[i]);
-
 	mutex_lock(&ld->ops_lock);
 	ld->ops = NULL;
 	mutex_unlock(&ld->ops_lock);
 	lcd_unregister_fb(ld);
-	class_device_unregister(&ld->class_dev);
+
+	device_unregister(&ld->dev);
 }
 EXPORT_SYMBOL(lcd_device_unregister);
 
 static void __exit lcd_class_exit(void)
 {
-	class_unregister(&lcd_class);
+	class_destroy(lcd_class);
 }
 
 static int __init lcd_class_init(void)
 {
-	return class_register(&lcd_class);
+	lcd_class = class_create(THIS_MODULE, "lcd");
+	if (IS_ERR(lcd_class)) {
+		printk(KERN_WARNING "Unable to create backlight class; errno = %ld\n",
+				PTR_ERR(lcd_class));
+		return PTR_ERR(lcd_class);
+	}
+
+	lcd_class->dev_attrs = lcd_device_attributes;
+	return 0;
 }
 
 /*
diff --git a/drivers/video/nvidia/nv_backlight.c b/drivers/video/nvidia/nv_backlight.c
index 43f62d8ee41d..443e3c85a9a0 100644
--- a/drivers/video/nvidia/nv_backlight.c
+++ b/drivers/video/nvidia/nv_backlight.c
@@ -50,7 +50,7 @@ static int nvidia_bl_get_level_brightness(struct nvidia_par *par,
 
 static int nvidia_bl_update_status(struct backlight_device *bd)
 {
-	struct nvidia_par *par = class_get_devdata(&bd->class_dev);
+	struct nvidia_par *par = bl_get_data(bd);
 	u32 tmp_pcrt, tmp_pmc, fpcontrol;
 	int level;
 
diff --git a/drivers/video/riva/fbdev.c b/drivers/video/riva/fbdev.c
index 0fe547842c64..d251174d8baa 100644
--- a/drivers/video/riva/fbdev.c
+++ b/drivers/video/riva/fbdev.c
@@ -307,7 +307,7 @@ static int riva_bl_get_level_brightness(struct riva_par *par,
 
 static int riva_bl_update_status(struct backlight_device *bd)
 {
-	struct riva_par *par = class_get_devdata(&bd->class_dev);
+	struct riva_par *par = bl_get_data(bd);
 	U032 tmp_pcrt, tmp_pmc;
 	int level;
 
diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index 1023ba0d6e55..c897c7b03858 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -69,8 +69,8 @@ struct backlight_device {
 
 	/* The framebuffer notifier block */
 	struct notifier_block fb_notif;
-	/* The class device structure */
-	struct class_device class_dev;
+
+	struct device dev;
 };
 
 static inline void backlight_update_status(struct backlight_device *bd)
@@ -85,6 +85,11 @@ extern struct backlight_device *backlight_device_register(const char *name,
 	struct device *dev, void *devdata, struct backlight_ops *ops);
 extern void backlight_device_unregister(struct backlight_device *bd);
 
-#define to_backlight_device(obj) container_of(obj, struct backlight_device, class_dev)
+#define to_backlight_device(obj) container_of(obj, struct backlight_device, dev)
+
+static inline void * bl_get_data(struct backlight_device *bl_dev)
+{
+	return dev_get_drvdata(&bl_dev->dev);
+}
 
 #endif
diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index 598793c0745b..1d379787f2e7 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -62,8 +62,8 @@ struct lcd_device {
 	struct mutex update_lock;
 	/* The framebuffer notifier block */
 	struct notifier_block fb_notif;
-	/* The class device structure */
-	struct class_device class_dev;
+
+	struct device dev;
 };
 
 static inline void lcd_set_power(struct lcd_device *ld, int power)
@@ -75,9 +75,15 @@ static inline void lcd_set_power(struct lcd_device *ld, int power)
 }
 
 extern struct lcd_device *lcd_device_register(const char *name,
-	void *devdata, struct lcd_ops *ops);
+	struct device *parent, void *devdata, struct lcd_ops *ops);
 extern void lcd_device_unregister(struct lcd_device *ld);
 
-#define to_lcd_device(obj) container_of(obj, struct lcd_device, class_dev)
+#define to_lcd_device(obj) container_of(obj, struct lcd_device, dev)
+
+static inline void * lcd_get_data(struct lcd_device *ld_dev)
+{
+	return dev_get_drvdata(&ld_dev->dev);
+}
+
 
 #endif
-- 
cgit v1.3-8-gc7d7


From 3d6392cfbd7dc11f23058e3493683afab4ac13a3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 9 Jul 2007 12:38:05 +0200
Subject: bsg: support for full generic block layer SG v3

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Kconfig            |   7 +
 block/Makefile           |   1 +
 block/bsg.c              | 997 +++++++++++++++++++++++++++++++++++++++++++++++
 block/ll_rw_blk.c        |   8 +
 block/scsi_ioctl.c       | 163 +++++---
 drivers/ide/ide-floppy.c |  29 +-
 drivers/ide/ide.c        |  10 +-
 include/linux/blkdev.h   |  12 +
 include/linux/bsg.h      |  21 +
 include/linux/genhd.h    |   2 +
 10 files changed, 1171 insertions(+), 79 deletions(-)
 create mode 100644 block/bsg.c
 create mode 100644 include/linux/bsg.h

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 285935134bcd..da12f2649cce 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -51,4 +51,11 @@ config LSF
 
 endif # BLOCK
 
+config BLK_DEV_BSG
+	bool "Block layer SG support"
+	default y
+	---help---
+	Saying Y here will enable generic SG (SCSI generic) v3
+	support for any block device.
+
 source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 4b84d0d5947b..959feeb253be 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_BLOCK) := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
 
+obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
diff --git a/block/bsg.c b/block/bsg.c
new file mode 100644
index 000000000000..724b69391cdc
--- /dev/null
+++ b/block/bsg.c
@@ -0,0 +1,997 @@
+/*
+ * bsg.c - block layer implementation of the sg v3 interface
+ *
+ * Copyright (C) 2004 Jens Axboe <axboe@suse.de> SUSE Labs
+ * Copyright (C) 2004 Peter M. Jones <pjones@redhat.com>
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License version 2.  See the file "COPYING" in the main directory of this
+ *  archive for more details.
+ *
+ */
+/*
+ * TODO
+ *	- Should this get merged, block/scsi_ioctl.c will be migrated into
+ *	  this file. To keep maintenance down, it's easier to have them
+ *	  seperated right now.
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/percpu.h>
+#include <linux/uio.h>
+#include <linux/bsg.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/sg.h>
+
+static char bsg_version[] = "block layer sg (bsg) 0.4";
+
+struct bsg_command;
+
+struct bsg_device {
+	struct gendisk *disk;
+	request_queue_t *queue;
+	spinlock_t lock;
+	struct list_head busy_list;
+	struct list_head done_list;
+	struct hlist_node dev_list;
+	atomic_t ref_count;
+	int minor;
+	int queued_cmds;
+	int done_cmds;
+	unsigned long *cmd_bitmap;
+	struct bsg_command *cmd_map;
+	wait_queue_head_t wq_done;
+	wait_queue_head_t wq_free;
+	char name[BDEVNAME_SIZE];
+	int max_queue;
+	unsigned long flags;
+};
+
+enum {
+	BSG_F_BLOCK		= 1,
+	BSG_F_WRITE_PERM	= 2,
+};
+
+/*
+ * command allocation bitmap defines
+ */
+#define BSG_CMDS_PAGE_ORDER	(1)
+#define BSG_CMDS_PER_LONG	(sizeof(unsigned long) * 8)
+#define BSG_CMDS_MASK		(BSG_CMDS_PER_LONG - 1)
+#define BSG_CMDS_BYTES		(PAGE_SIZE * (1 << BSG_CMDS_PAGE_ORDER))
+#define BSG_CMDS		(BSG_CMDS_BYTES / sizeof(struct bsg_command))
+
+#undef BSG_DEBUG
+
+#ifdef BSG_DEBUG
+#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __FUNCTION__, ##args)
+#else
+#define dprintk(fmt, args...)
+#endif
+
+#define list_entry_bc(entry)	list_entry((entry), struct bsg_command, list)
+
+/*
+ * just for testing
+ */
+#define BSG_MAJOR	(240)
+
+static DEFINE_MUTEX(bsg_mutex);
+static int bsg_device_nr;
+
+#define BSG_LIST_SIZE	(8)
+#define bsg_list_idx(minor)	((minor) & (BSG_LIST_SIZE - 1))
+static struct hlist_head bsg_device_list[BSG_LIST_SIZE];
+
+static struct class *bsg_class;
+static LIST_HEAD(bsg_class_list);
+
+/*
+ * our internal command type
+ */
+struct bsg_command {
+	struct bsg_device *bd;
+	struct list_head list;
+	struct request *rq;
+	struct bio *bio;
+	int err;
+	struct sg_io_hdr hdr;
+	struct sg_io_hdr __user *uhdr;
+	char sense[SCSI_SENSE_BUFFERSIZE];
+};
+
+static void bsg_free_command(struct bsg_command *bc)
+{
+	struct bsg_device *bd = bc->bd;
+	unsigned long bitnr = bc - bd->cmd_map;
+	unsigned long flags;
+
+	dprintk("%s: command bit offset %lu\n", bd->name, bitnr);
+
+	spin_lock_irqsave(&bd->lock, flags);
+	bd->queued_cmds--;
+	__clear_bit(bitnr, bd->cmd_bitmap);
+	spin_unlock_irqrestore(&bd->lock, flags);
+
+	wake_up(&bd->wq_free);
+}
+
+static struct bsg_command *__bsg_alloc_command(struct bsg_device *bd)
+{
+	struct bsg_command *bc = NULL;
+	unsigned long *map;
+	int free_nr;
+
+	spin_lock_irq(&bd->lock);
+
+	if (bd->queued_cmds >= bd->max_queue)
+		goto out;
+
+	for (free_nr = 0, map = bd->cmd_bitmap; *map == ~0UL; map++)
+		free_nr += BSG_CMDS_PER_LONG;
+
+	BUG_ON(*map == ~0UL);
+
+	bd->queued_cmds++;
+	free_nr += ffz(*map);
+	__set_bit(free_nr, bd->cmd_bitmap);
+	spin_unlock_irq(&bd->lock);
+
+	bc = bd->cmd_map + free_nr;
+	memset(bc, 0, sizeof(*bc));
+	bc->bd = bd;
+	INIT_LIST_HEAD(&bc->list);
+	dprintk("%s: returning free cmd %p (bit %d)\n", bd->name, bc, free_nr);
+	return bc;
+out:
+	dprintk("%s: failed (depth %d)\n", bd->name, bd->queued_cmds);
+	spin_unlock_irq(&bd->lock);
+	return bc;
+}
+
+static inline void
+bsg_del_done_cmd(struct bsg_device *bd, struct bsg_command *bc)
+{
+	bd->done_cmds--;
+	list_del(&bc->list);
+}
+
+static inline void
+bsg_add_done_cmd(struct bsg_device *bd, struct bsg_command *bc)
+{
+	bd->done_cmds++;
+	list_add_tail(&bc->list, &bd->done_list);
+	wake_up(&bd->wq_done);
+}
+
+static inline int bsg_io_schedule(struct bsg_device *bd, int state)
+{
+	DEFINE_WAIT(wait);
+	int ret = 0;
+
+	spin_lock_irq(&bd->lock);
+
+	BUG_ON(bd->done_cmds > bd->queued_cmds);
+
+	/*
+	 * -ENOSPC or -ENODATA?  I'm going for -ENODATA, meaning "I have no
+	 * work to do", even though we return -ENOSPC after this same test
+	 * during bsg_write() -- there, it means our buffer can't have more
+	 * bsg_commands added to it, thus has no space left.
+	 */
+	if (bd->done_cmds == bd->queued_cmds) {
+		ret = -ENODATA;
+		goto unlock;
+	}
+
+	if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
+		ret = -EAGAIN;
+		goto unlock;
+	}
+
+	prepare_to_wait(&bd->wq_done, &wait, state);
+	spin_unlock_irq(&bd->lock);
+	io_schedule();
+	finish_wait(&bd->wq_done, &wait);
+
+	if ((state == TASK_INTERRUPTIBLE) && signal_pending(current))
+		ret = -ERESTARTSYS;
+
+	return ret;
+unlock:
+	spin_unlock_irq(&bd->lock);
+	return ret;
+}
+
+/*
+ * get a new free command, blocking if needed and specified
+ */
+static struct bsg_command *bsg_get_command(struct bsg_device *bd)
+{
+	struct bsg_command *bc;
+	int ret;
+
+	do {
+		bc = __bsg_alloc_command(bd);
+		if (bc)
+			break;
+
+		ret = bsg_io_schedule(bd, TASK_INTERRUPTIBLE);
+		if (ret) {
+			bc = ERR_PTR(ret);
+			break;
+		}
+
+	} while (1);
+
+	return bc;
+}
+
+/*
+ * Check if sg_io_hdr from user is allowed and valid
+ */
+static int
+bsg_validate_sghdr(request_queue_t *q, struct sg_io_hdr *hdr, int *rw)
+{
+	if (hdr->interface_id != 'S')
+		return -EINVAL;
+	if (hdr->cmd_len > BLK_MAX_CDB)
+		return -EINVAL;
+	if (hdr->dxfer_len > (q->max_sectors << 9))
+		return -EIO;
+
+	/*
+	 * looks sane, if no data then it should be fine from our POV
+	 */
+	if (!hdr->dxfer_len)
+		return 0;
+
+	switch (hdr->dxfer_direction) {
+		case SG_DXFER_TO_FROM_DEV:
+		case SG_DXFER_FROM_DEV:
+			*rw = READ;
+			break;
+		case SG_DXFER_TO_DEV:
+			*rw = WRITE;
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * map sg_io_hdr to a request. for scatter-gather sg_io_hdr, we map
+ * each segment to a bio and string multiple bio's to the request
+ */
+static struct request *
+bsg_map_hdr(struct bsg_device *bd, int rw, struct sg_io_hdr *hdr)
+{
+	request_queue_t *q = bd->queue;
+	struct sg_iovec iov;
+	struct sg_iovec __user *u_iov;
+	struct request *rq;
+	int ret, i = 0;
+
+	dprintk("map hdr %p/%d/%d\n", hdr->dxferp, hdr->dxfer_len,
+					hdr->iovec_count);
+
+	ret = bsg_validate_sghdr(q, hdr, &rw);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/*
+	 * map scatter-gather elements seperately and string them to request
+	 */
+	rq = blk_get_request(q, rw, GFP_KERNEL);
+	ret = blk_fill_sghdr_rq(q, rq, hdr, test_bit(BSG_F_WRITE_PERM,
+				&bd->flags));
+	if (ret) {
+		blk_put_request(rq);
+		return ERR_PTR(ret);
+	}
+
+	if (!hdr->iovec_count) {
+		ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
+		if (ret)
+			goto out;
+	}
+
+	u_iov = hdr->dxferp;
+	for (ret = 0, i = 0; i < hdr->iovec_count; i++, u_iov++) {
+		if (copy_from_user(&iov, u_iov, sizeof(iov))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (!iov.iov_len || !iov.iov_base) {
+			ret = -EINVAL;
+			break;
+		}
+
+		ret = blk_rq_map_user(q, rq, iov.iov_base, iov.iov_len);
+		if (ret)
+			break;
+	}
+
+	/*
+	 * bugger, cleanup
+	 */
+	if (ret) {
+out:
+		dprintk("failed map at %d: %d\n", i, ret);
+		blk_unmap_sghdr_rq(rq, hdr);
+		rq = ERR_PTR(ret);
+	}
+
+	return rq;
+}
+
+/*
+ * async completion call-back from the block layer, when scsi/ide/whatever
+ * calls end_that_request_last() on a request
+ */
+static void bsg_rq_end_io(struct request *rq, int uptodate)
+{
+	struct bsg_command *bc = rq->end_io_data;
+	struct bsg_device *bd = bc->bd;
+	unsigned long flags;
+
+	dprintk("%s: finished rq %p bio %p, bc %p offset %ld stat %d\n",
+			bd->name, rq, bc, bc->bio, bc - bd->cmd_map, uptodate);
+
+	bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
+
+	spin_lock_irqsave(&bd->lock, flags);
+	list_del(&bc->list);
+	bsg_add_done_cmd(bd, bc);
+	spin_unlock_irqrestore(&bd->lock, flags);
+}
+
+/*
+ * do final setup of a 'bc' and submit the matching 'rq' to the block
+ * layer for io
+ */
+static void bsg_add_command(struct bsg_device *bd, request_queue_t *q,
+			    struct bsg_command *bc, struct request *rq)
+{
+	rq->sense = bc->sense;
+	rq->sense_len = 0;
+
+	/*
+	 * add bc command to busy queue and submit rq for io
+	 */
+	bc->rq = rq;
+	bc->bio = rq->bio;
+	bc->hdr.duration = jiffies;
+	spin_lock_irq(&bd->lock);
+	list_add_tail(&bc->list, &bd->busy_list);
+	spin_unlock_irq(&bd->lock);
+
+	dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc);
+
+	rq->end_io_data = bc;
+	blk_execute_rq_nowait(q, bd->disk, rq, 1, bsg_rq_end_io);
+}
+
+static inline struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
+{
+	struct bsg_command *bc = NULL;
+
+	spin_lock_irq(&bd->lock);
+	if (bd->done_cmds) {
+		bc = list_entry_bc(bd->done_list.next);
+		bsg_del_done_cmd(bd, bc);
+	}
+	spin_unlock_irq(&bd->lock);
+
+	return bc;
+}
+
+/*
+ * Get a finished command from the done list
+ */
+static struct bsg_command *__bsg_get_done_cmd(struct bsg_device *bd, int state)
+{
+	struct bsg_command *bc;
+	int ret;
+
+	do {
+		bc = bsg_next_done_cmd(bd);
+		if (bc)
+			break;
+
+		ret = bsg_io_schedule(bd, state);
+		if (ret) {
+			bc = ERR_PTR(ret);
+			break;
+		}
+	} while (1);
+
+	dprintk("%s: returning done %p\n", bd->name, bc);
+
+	return bc;
+}
+
+static struct bsg_command *
+bsg_get_done_cmd(struct bsg_device *bd, const struct iovec *iov)
+{
+	return __bsg_get_done_cmd(bd, TASK_INTERRUPTIBLE);
+}
+
+static struct bsg_command *
+bsg_get_done_cmd_nosignals(struct bsg_device *bd)
+{
+	return __bsg_get_done_cmd(bd, TASK_UNINTERRUPTIBLE);
+}
+
+static int bsg_complete_all_commands(struct bsg_device *bd)
+{
+	struct bsg_command *bc;
+	int ret, tret;
+
+	dprintk("%s: entered\n", bd->name);
+
+	set_bit(BSG_F_BLOCK, &bd->flags);
+
+	/*
+	 * wait for all commands to complete
+	 */
+	ret = 0;
+	do {
+		ret = bsg_io_schedule(bd, TASK_UNINTERRUPTIBLE);
+		/*
+		 * look for -ENODATA specifically -- we'll sometimes get
+		 * -ERESTARTSYS when we've taken a signal, but we can't
+		 * return until we're done freeing the queue, so ignore
+		 * it.  The signal will get handled when we're done freeing
+		 * the bsg_device.
+		 */
+	} while (ret != -ENODATA);
+
+	/*
+	 * discard done commands
+	 */
+	ret = 0;
+	do {
+		bc = bsg_get_done_cmd_nosignals(bd);
+
+		/*
+		 * we _must_ complete before restarting, because
+		 * bsg_release can't handle this failing.
+		 */
+		if (PTR_ERR(bc) == -ERESTARTSYS)
+			continue;
+		if (IS_ERR(bc)) {
+			ret = PTR_ERR(bc);
+			break;
+		}
+
+		tret = blk_complete_sghdr_rq(bc->rq, &bc->hdr, bc->bio);
+		if (!ret)
+			ret = tret;
+
+		bsg_free_command(bc);
+	} while (1);
+
+	return ret;
+}
+
+typedef struct bsg_command *(*bsg_command_callback)(struct bsg_device *bd, const struct iovec *iov);
+
+static ssize_t
+__bsg_read(char __user *buf, size_t count, bsg_command_callback get_bc,
+	   struct bsg_device *bd, const struct iovec *iov, ssize_t *bytes_read)
+{
+	struct bsg_command *bc;
+	int nr_commands, ret;
+
+	if (count % sizeof(struct sg_io_hdr))
+		return -EINVAL;
+
+	ret = 0;
+	nr_commands = count / sizeof(struct sg_io_hdr);
+	while (nr_commands) {
+		bc = get_bc(bd, iov);
+		if (IS_ERR(bc)) {
+			ret = PTR_ERR(bc);
+			break;
+		}
+
+		/*
+		 * this is the only case where we need to copy data back
+		 * after completing the request. so do that here,
+		 * bsg_complete_work() cannot do that for us
+		 */
+		ret = blk_complete_sghdr_rq(bc->rq, &bc->hdr, bc->bio);
+
+		if (copy_to_user(buf, (char *) &bc->hdr, sizeof(bc->hdr)))
+			ret = -EFAULT;
+
+		bsg_free_command(bc);
+
+		if (ret)
+			break;
+
+		buf += sizeof(struct sg_io_hdr);
+		*bytes_read += sizeof(struct sg_io_hdr);
+		nr_commands--;
+	}
+
+	return ret;
+}
+
+static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
+{
+	if (file->f_flags & O_NONBLOCK)
+		clear_bit(BSG_F_BLOCK, &bd->flags);
+	else
+		set_bit(BSG_F_BLOCK, &bd->flags);
+}
+
+static inline void bsg_set_write_perm(struct bsg_device *bd, struct file *file)
+{
+	if (file->f_mode & FMODE_WRITE)
+		set_bit(BSG_F_WRITE_PERM, &bd->flags);
+	else
+		clear_bit(BSG_F_WRITE_PERM, &bd->flags);
+}
+
+static inline int err_block_err(int ret)
+{
+	if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN)
+		return 1;
+
+	return 0;
+}
+
+static ssize_t
+bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct bsg_device *bd = file->private_data;
+	int ret;
+	ssize_t bytes_read;
+
+	dprintk("%s: read %lu bytes\n", bd->name, count);
+
+	bsg_set_block(bd, file);
+	bytes_read = 0;
+	ret = __bsg_read(buf, count, bsg_get_done_cmd,
+			bd, NULL, &bytes_read);
+	*ppos = bytes_read;
+
+	if (!bytes_read || (bytes_read && err_block_err(ret)))
+		bytes_read = ret;
+
+	return bytes_read;
+}
+
+static ssize_t __bsg_write(struct bsg_device *bd, const char __user *buf,
+			   size_t count, ssize_t *bytes_read)
+{
+	struct bsg_command *bc;
+	struct request *rq;
+	int ret, nr_commands;
+
+	if (count % sizeof(struct sg_io_hdr))
+		return -EINVAL;
+
+	nr_commands = count / sizeof(struct sg_io_hdr);
+	rq = NULL;
+	bc = NULL;
+	ret = 0;
+	while (nr_commands) {
+		request_queue_t *q = bd->queue;
+		int rw = READ;
+
+		bc = bsg_get_command(bd);
+		if (!bc)
+			break;
+		if (IS_ERR(bc)) {
+			ret = PTR_ERR(bc);
+			bc = NULL;
+			break;
+		}
+
+		bc->uhdr = (struct sg_io_hdr __user *) buf;
+		if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/*
+		 * get a request, fill in the blanks, and add to request queue
+		 */
+		rq = bsg_map_hdr(bd, rw, &bc->hdr);
+		if (IS_ERR(rq)) {
+			ret = PTR_ERR(rq);
+			rq = NULL;
+			break;
+		}
+
+		bsg_add_command(bd, q, bc, rq);
+		bc = NULL;
+		rq = NULL;
+		nr_commands--;
+		buf += sizeof(struct sg_io_hdr);
+		*bytes_read += sizeof(struct sg_io_hdr);
+	}
+
+	if (rq)
+		blk_unmap_sghdr_rq(rq, &bc->hdr);
+	if (bc)
+		bsg_free_command(bc);
+
+	return ret;
+}
+
+static ssize_t
+bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct bsg_device *bd = file->private_data;
+	ssize_t bytes_read;
+	int ret;
+
+	dprintk("%s: write %lu bytes\n", bd->name, count);
+
+	bsg_set_block(bd, file);
+	bsg_set_write_perm(bd, file);
+
+	bytes_read = 0;
+	ret = __bsg_write(bd, buf, count, &bytes_read);
+	*ppos = bytes_read;
+
+	/*
+	 * return bytes written on non-fatal errors
+	 */
+	if (!bytes_read || (bytes_read && err_block_err(ret)))
+		bytes_read = ret;
+
+	dprintk("%s: returning %lu\n", bd->name, bytes_read);
+	return bytes_read;
+}
+
+static void bsg_free_device(struct bsg_device *bd)
+{
+	if (bd->cmd_map)
+		free_pages((unsigned long) bd->cmd_map, BSG_CMDS_PAGE_ORDER);
+
+	kfree(bd->cmd_bitmap);
+	kfree(bd);
+}
+
+static struct bsg_device *bsg_alloc_device(void)
+{
+	struct bsg_command *cmd_map;
+	unsigned long *cmd_bitmap;
+	struct bsg_device *bd;
+	int bits;
+
+	bd = kzalloc(sizeof(struct bsg_device), GFP_KERNEL);
+	if (unlikely(!bd))
+		return NULL;
+
+	spin_lock_init(&bd->lock);
+
+	bd->max_queue = BSG_CMDS;
+
+	bits = (BSG_CMDS / BSG_CMDS_PER_LONG) + 1;
+	cmd_bitmap = kzalloc(bits * sizeof(unsigned long), GFP_KERNEL);
+	if (!cmd_bitmap)
+		goto out_free_bd;
+	bd->cmd_bitmap = cmd_bitmap;
+
+	cmd_map = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						BSG_CMDS_PAGE_ORDER);
+	if (!cmd_map)
+		goto out_free_bitmap;
+	bd->cmd_map = cmd_map;
+
+	INIT_LIST_HEAD(&bd->busy_list);
+	INIT_LIST_HEAD(&bd->done_list);
+	INIT_HLIST_NODE(&bd->dev_list);
+
+	init_waitqueue_head(&bd->wq_free);
+	init_waitqueue_head(&bd->wq_done);
+	return bd;
+
+out_free_bitmap:
+	kfree(cmd_bitmap);
+out_free_bd:
+	kfree(bd);
+	return NULL;
+}
+
+static int bsg_put_device(struct bsg_device *bd)
+{
+	int ret = 0;
+
+	mutex_lock(&bsg_mutex);
+
+	if (!atomic_dec_and_test(&bd->ref_count))
+		goto out;
+
+	dprintk("%s: tearing down\n", bd->name);
+
+	/*
+	 * close can always block
+	 */
+	set_bit(BSG_F_BLOCK, &bd->flags);
+
+	/*
+	 * correct error detection baddies here again. it's the responsibility
+	 * of the app to properly reap commands before close() if it wants
+	 * fool-proof error detection
+	 */
+	ret = bsg_complete_all_commands(bd);
+
+	blk_put_queue(bd->queue);
+	hlist_del(&bd->dev_list);
+	bsg_free_device(bd);
+out:
+	mutex_unlock(&bsg_mutex);
+	return ret;
+}
+
+static struct bsg_device *bsg_add_device(struct inode *inode,
+					 struct gendisk *disk,
+					 struct file *file)
+{
+	struct bsg_device *bd = NULL;
+#ifdef BSG_DEBUG
+	unsigned char buf[32];
+#endif
+
+	bd = bsg_alloc_device();
+	if (!bd)
+		return ERR_PTR(-ENOMEM);
+
+	bd->disk = disk;
+	bd->queue = disk->queue;
+	kobject_get(&disk->queue->kobj);
+	bsg_set_block(bd, file);
+
+	atomic_set(&bd->ref_count, 1);
+	bd->minor = iminor(inode);
+	mutex_lock(&bsg_mutex);
+	hlist_add_head(&bd->dev_list,&bsg_device_list[bsg_list_idx(bd->minor)]);
+
+	strncpy(bd->name, disk->disk_name, sizeof(bd->name) - 1);
+	dprintk("bound to <%s>, max queue %d\n",
+		format_dev_t(buf, i->i_rdev), bd->max_queue);
+
+	mutex_unlock(&bsg_mutex);
+	return bd;
+}
+
+static struct bsg_device *__bsg_get_device(int minor)
+{
+	struct hlist_head *list = &bsg_device_list[bsg_list_idx(minor)];
+	struct bsg_device *bd = NULL;
+	struct hlist_node *entry;
+
+	mutex_lock(&bsg_mutex);
+
+	hlist_for_each(entry, list) {
+		bd = hlist_entry(entry, struct bsg_device, dev_list);
+		if (bd->minor == minor) {
+			atomic_inc(&bd->ref_count);
+			break;
+		}
+
+		bd = NULL;
+	}
+
+	mutex_unlock(&bsg_mutex);
+	return bd;
+}
+
+static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
+{
+	struct bsg_device *bd = __bsg_get_device(iminor(inode));
+	struct bsg_class_device *bcd, *__bcd;
+
+	if (bd)
+		return bd;
+
+	/*
+	 * find the class device
+	 */
+	bcd = NULL;
+	mutex_lock(&bsg_mutex);
+	list_for_each_entry(__bcd, &bsg_class_list, list) {
+		if (__bcd->minor == iminor(inode)) {
+			bcd = __bcd;
+			break;
+		}
+	}
+	mutex_unlock(&bsg_mutex);
+
+	if (!bcd)
+		return ERR_PTR(-ENODEV);
+
+	return bsg_add_device(inode, bcd->disk, file);
+}
+
+static int bsg_open(struct inode *inode, struct file *file)
+{
+	struct bsg_device *bd = bsg_get_device(inode, file);
+
+	if (IS_ERR(bd))
+		return PTR_ERR(bd);
+
+	file->private_data = bd;
+	return 0;
+}
+
+static int bsg_release(struct inode *inode, struct file *file)
+{
+	struct bsg_device *bd = file->private_data;
+
+	file->private_data = NULL;
+	return bsg_put_device(bd);
+}
+
+static unsigned int bsg_poll(struct file *file, poll_table *wait)
+{
+	struct bsg_device *bd = file->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(file, &bd->wq_done, wait);
+	poll_wait(file, &bd->wq_free, wait);
+
+	spin_lock_irq(&bd->lock);
+	if (!list_empty(&bd->done_list))
+		mask |= POLLIN | POLLRDNORM;
+	if (bd->queued_cmds >= bd->max_queue)
+		mask |= POLLOUT;
+	spin_unlock_irq(&bd->lock);
+
+	return mask;
+}
+
+static int
+bsg_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+	  unsigned long arg)
+{
+	struct bsg_device *bd = file->private_data;
+	int __user *uarg = (int __user *) arg;
+
+	if (!bd)
+		return -ENXIO;
+
+	switch (cmd) {
+		/*
+		 * our own ioctls
+		 */
+	case SG_GET_COMMAND_Q:
+		return put_user(bd->max_queue, uarg);
+		case SG_SET_COMMAND_Q: {
+		int queue;
+
+		if (get_user(queue, uarg))
+			return -EFAULT;
+		if (queue > BSG_CMDS || queue < 1)
+			return -EINVAL;
+
+		bd->max_queue = queue;
+		return 0;
+	}
+
+	/*
+	 * SCSI/sg ioctls
+	 */
+	case SG_GET_VERSION_NUM:
+	case SCSI_IOCTL_GET_IDLUN:
+	case SCSI_IOCTL_GET_BUS_NUMBER:
+	case SG_SET_TIMEOUT:
+	case SG_GET_TIMEOUT:
+	case SG_GET_RESERVED_SIZE:
+	case SG_SET_RESERVED_SIZE:
+	case SG_EMULATED_HOST:
+	case SG_IO:
+	case SCSI_IOCTL_SEND_COMMAND: {
+		void __user *uarg = (void __user *) arg;
+		return scsi_cmd_ioctl(file, bd->disk, cmd, uarg);
+	}
+	/*
+	 * block device ioctls
+	 */
+	default:
+#if 0
+		return ioctl_by_bdev(bd->bdev, cmd, arg);
+#else
+		return -ENOTTY;
+#endif
+	}
+}
+
+static struct file_operations bsg_fops = {
+	.read		=	bsg_read,
+	.write		=	bsg_write,
+	.poll		=	bsg_poll,
+	.open		=	bsg_open,
+	.release	=	bsg_release,
+	.ioctl		=	bsg_ioctl,
+	.owner		=	THIS_MODULE,
+};
+
+void bsg_unregister_disk(struct gendisk *disk)
+{
+	struct bsg_class_device *bcd = &disk->bsg_dev;
+
+	if (!bcd->class_dev)
+		return;
+
+	mutex_lock(&bsg_mutex);
+	sysfs_remove_link(&bcd->disk->queue->kobj, "bsg");
+	class_device_destroy(bsg_class, MKDEV(BSG_MAJOR, bcd->minor));
+	bcd->class_dev = NULL;
+	list_del_init(&bcd->list);
+	mutex_unlock(&bsg_mutex);
+}
+
+int bsg_register_disk(struct gendisk *disk)
+{
+	request_queue_t *q = disk->queue;
+	struct bsg_class_device *bcd;
+	dev_t dev;
+
+	/*
+	 * we need a proper transport to send commands, not a stacked device
+	 */
+	if (!q->request_fn)
+		return 0;
+
+	bcd = &disk->bsg_dev;
+	memset(bcd, 0, sizeof(*bcd));
+	INIT_LIST_HEAD(&bcd->list);
+
+	mutex_lock(&bsg_mutex);
+	dev = MKDEV(BSG_MAJOR, bsg_device_nr);
+	bcd->minor = bsg_device_nr;
+	bsg_device_nr++;
+	bcd->disk = disk;
+	bcd->class_dev = class_device_create(bsg_class, NULL, dev, bcd->dev, "%s", disk->disk_name);
+	list_add_tail(&bcd->list, &bsg_class_list);
+	sysfs_create_link(&q->kobj, &bcd->class_dev->kobj, "bsg");
+	mutex_unlock(&bsg_mutex);
+	return 0;
+}
+
+static int __init bsg_init(void)
+{
+	int ret, i;
+
+	for (i = 0; i < BSG_LIST_SIZE; i++)
+		INIT_HLIST_HEAD(&bsg_device_list[i]);
+
+	bsg_class = class_create(THIS_MODULE, "bsg");
+	if (IS_ERR(bsg_class))
+		return PTR_ERR(bsg_class);
+
+	ret = register_chrdev(BSG_MAJOR, "bsg", &bsg_fops);
+	if (ret) {
+		class_destroy(bsg_class);
+		return ret;
+	}
+
+	printk(KERN_INFO "%s loaded\n", bsg_version);
+	return 0;
+}
+
+MODULE_AUTHOR("Jens Axboe");
+MODULE_DESCRIPTION("Block layer SGSI generic (sg) driver");
+MODULE_LICENSE("GPL");
+
+subsys_initcall(bsg_init);
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index ef42bb2b12b6..3795e0708a22 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -4091,6 +4091,13 @@ int blk_register_queue(struct gendisk *disk)
 		return ret;
 	}
 
+	ret = bsg_register_disk(disk);
+	if (ret) {
+		elv_unregister_queue(q);
+		kobject_unregister(&q->kobj);
+		return ret;
+	}
+
 	return 0;
 }
 
@@ -4099,6 +4106,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	request_queue_t *q = disk->queue;
 
 	if (q && q->request_fn) {
+		bsg_unregister_disk(disk);
 		elv_unregister_queue(q);
 
 		kobject_uevent(&q->kobj, KOBJ_REMOVE);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index e83f1dbf7c29..88fd008d38bd 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -41,8 +41,6 @@ const unsigned char scsi_command_size[8] =
 
 EXPORT_SYMBOL(scsi_command_size);
 
-#define BLK_DEFAULT_TIMEOUT	(60 * HZ)
-
 #include <scsi/sg.h>
 
 static int sg_get_version(int __user *p)
@@ -114,7 +112,7 @@ static int sg_emulated_host(request_queue_t *q, int __user *p)
 #define safe_for_read(cmd)	[cmd] = CMD_READ_SAFE
 #define safe_for_write(cmd)	[cmd] = CMD_WRITE_SAFE
 
-static int verify_command(struct file *file, unsigned char *cmd)
+static int verify_command(unsigned char *cmd, int has_write_perm)
 {
 	static unsigned char cmd_type[256] = {
 
@@ -193,18 +191,11 @@ static int verify_command(struct file *file, unsigned char *cmd)
 		safe_for_write(GPCMD_SET_STREAMING),
 	};
 	unsigned char type = cmd_type[cmd[0]];
-	int has_write_perm = 0;
 
 	/* Anybody who can open the device can do a read-safe command */
 	if (type & CMD_READ_SAFE)
 		return 0;
 
-	/*
-	 * file can be NULL from ioctl_by_bdev()...
-	 */
-	if (file)
-		has_write_perm = file->f_mode & FMODE_WRITE;
-
 	/* Write-safe commands just require a writable open.. */
 	if ((type & CMD_WRITE_SAFE) && has_write_perm)
 		return 0;
@@ -222,24 +213,104 @@ static int verify_command(struct file *file, unsigned char *cmd)
 	return -EPERM;
 }
 
+int blk_fill_sghdr_rq(request_queue_t *q, struct request *rq,
+		      struct sg_io_hdr *hdr, int has_write_perm)
+{
+	memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
+
+	if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
+		return -EFAULT;
+	if (verify_command(rq->cmd, has_write_perm))
+		return -EPERM;
+
+	/*
+	 * fill in request structure
+	 */
+	rq->cmd_len = hdr->cmd_len;
+	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+
+	rq->timeout = (hdr->timeout * HZ) / 1000;
+	if (!rq->timeout)
+		rq->timeout = q->sg_timeout;
+	if (!rq->timeout)
+		rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_fill_sghdr_rq);
+
+/*
+ * unmap a request that was previously mapped to this sg_io_hdr. handles
+ * both sg and non-sg sg_io_hdr.
+ */
+int blk_unmap_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr)
+{
+	struct bio *bio = rq->bio;
+
+	/*
+	 * also releases request
+	 */
+	if (!hdr->iovec_count)
+		return blk_rq_unmap_user(bio, hdr->dxfer_len);
+
+	rq_for_each_bio(bio, rq)
+		bio_unmap_user(bio);
+
+	blk_put_request(rq);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_unmap_sghdr_rq);
+
+int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
+			  struct bio *bio)
+{
+	int r, ret = 0;
+
+	/*
+	 * fill in all the output members
+	 */
+	hdr->status = rq->errors & 0xff;
+	hdr->masked_status = status_byte(rq->errors);
+	hdr->msg_status = msg_byte(rq->errors);
+	hdr->host_status = host_byte(rq->errors);
+	hdr->driver_status = driver_byte(rq->errors);
+	hdr->info = 0;
+	if (hdr->masked_status || hdr->host_status || hdr->driver_status)
+		hdr->info |= SG_INFO_CHECK;
+	hdr->resid = rq->data_len;
+	hdr->sb_len_wr = 0;
+
+	if (rq->sense_len && hdr->sbp) {
+		int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
+
+		if (!copy_to_user(hdr->sbp, rq->sense, len))
+			hdr->sb_len_wr = len;
+		else
+			ret = -EFAULT;
+	}
+
+	rq->bio = bio;
+	r = blk_unmap_sghdr_rq(rq, hdr);
+	if (ret)
+		r = ret;
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(blk_complete_sghdr_rq);
+
 static int sg_io(struct file *file, request_queue_t *q,
 		struct gendisk *bd_disk, struct sg_io_hdr *hdr)
 {
-	unsigned long start_time, timeout;
-	int writing = 0, ret = 0;
+	unsigned long start_time;
+	int writing = 0, ret = 0, has_write_perm = 0;
 	struct request *rq;
 	char sense[SCSI_SENSE_BUFFERSIZE];
-	unsigned char cmd[BLK_MAX_CDB];
 	struct bio *bio;
 
 	if (hdr->interface_id != 'S')
 		return -EINVAL;
 	if (hdr->cmd_len > BLK_MAX_CDB)
 		return -EINVAL;
-	if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
-		return -EFAULT;
-	if (verify_command(file, cmd))
-		return -EPERM;
 
 	if (hdr->dxfer_len > (q->max_hw_sectors << 9))
 		return -EIO;
@@ -260,25 +331,14 @@ static int sg_io(struct file *file, request_queue_t *q,
 	if (!rq)
 		return -ENOMEM;
 
-	/*
-	 * fill in request structure
-	 */
-	rq->cmd_len = hdr->cmd_len;
-	memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
-	memcpy(rq->cmd, cmd, hdr->cmd_len);
-
-	memset(sense, 0, sizeof(sense));
-	rq->sense = sense;
-	rq->sense_len = 0;
-
-	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+	if (file)
+		has_write_perm = file->f_mode & FMODE_WRITE;
 
-	timeout = msecs_to_jiffies(hdr->timeout);
-	rq->timeout = (timeout < INT_MAX) ? timeout : INT_MAX;
-	if (!rq->timeout)
-		rq->timeout = q->sg_timeout;
-	if (!rq->timeout)
-		rq->timeout = BLK_DEFAULT_TIMEOUT;
+	if (blk_fill_sghdr_rq(q, rq, hdr, has_write_perm)) {
+		blk_rq_unmap_user(bio, hdr->dxfer_len);
+		blk_put_request(rq);
+		return -EFAULT;
+	}
 
 	if (hdr->iovec_count) {
 		const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
@@ -306,6 +366,9 @@ static int sg_io(struct file *file, request_queue_t *q,
 		goto out;
 
 	bio = rq->bio;
+	memset(sense, 0, sizeof(sense));
+	rq->sense = sense;
+	rq->sense_len = 0;
 	rq->retries = 0;
 
 	start_time = jiffies;
@@ -316,31 +379,9 @@ static int sg_io(struct file *file, request_queue_t *q,
 	 */
 	blk_execute_rq(q, bd_disk, rq, 0);
 
-	/* write to all output members */
-	hdr->status = 0xff & rq->errors;
-	hdr->masked_status = status_byte(rq->errors);
-	hdr->msg_status = msg_byte(rq->errors);
-	hdr->host_status = host_byte(rq->errors);
-	hdr->driver_status = driver_byte(rq->errors);
-	hdr->info = 0;
-	if (hdr->masked_status || hdr->host_status || hdr->driver_status)
-		hdr->info |= SG_INFO_CHECK;
-	hdr->resid = rq->data_len;
 	hdr->duration = ((jiffies - start_time) * 1000) / HZ;
-	hdr->sb_len_wr = 0;
-
-	if (rq->sense_len && hdr->sbp) {
-		int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
-
-		if (!copy_to_user(hdr->sbp, rq->sense, len))
-			hdr->sb_len_wr = len;
-	}
-
-	if (blk_rq_unmap_user(bio))
-		ret = -EFAULT;
 
-	/* may not have succeeded, but output values written to control
-	 * structure (struct sg_io_hdr).  */
+	return blk_complete_sghdr_rq(rq, hdr, bio);
 out:
 	blk_put_request(rq);
 	return ret;
@@ -427,7 +468,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q,
 	if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
 		goto error;
 
-	err = verify_command(file, rq->cmd);
+	err = verify_command(rq->cmd, file->f_mode & FMODE_WRITE);
 	if (err)
 		goto error;
 
@@ -454,7 +495,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q,
 		rq->retries = 1;
 		break;
 	default:
-		rq->timeout = BLK_DEFAULT_TIMEOUT;
+		rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
 		break;
 	}
 
@@ -501,7 +542,7 @@ static int __blk_send_generic(request_queue_t *q, struct gendisk *bd_disk, int c
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
 	rq->data = NULL;
 	rq->data_len = 0;
-	rq->timeout = BLK_DEFAULT_TIMEOUT;
+	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
 	memset(rq->cmd, 0, sizeof(rq->cmd));
 	rq->cmd[0] = cmd;
 	rq->cmd[4] = data;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index f429be88c4f9..a21f585b1caa 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -1258,19 +1258,25 @@ static void idefloppy_create_rw_cmd (idefloppy_floppy_t *floppy, idefloppy_pc_t
 	set_bit(PC_DMA_RECOMMENDED, &pc->flags);
 }
 
-static int
+static void
 idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy, idefloppy_pc_t *pc, struct request *rq)
 {
-	/*
-	 * just support eject for now, it would not be hard to make the
-	 * REQ_BLOCK_PC support fully-featured
-	 */
-	if (rq->cmd[0] != IDEFLOPPY_START_STOP_CMD)
-		return 1;
-
 	idefloppy_init_pc(pc);
+	pc->callback = &idefloppy_rw_callback;
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
-	return 0;
+	pc->rq = rq;
+	pc->b_count = rq->data_len;
+	if (rq->data_len && rq_data_dir(rq) == WRITE)
+		set_bit(PC_WRITING, &pc->flags);
+	pc->buffer = rq->data;
+	if (rq->bio)
+		set_bit(PC_DMA_RECOMMENDED, &pc->flags);
+		
+	/*
+	 * possibly problematic, doesn't look like ide-floppy correctly
+	 * handled scattered requests if dma fails...
+	 */
+	pc->request_transfer = pc->buffer_size = rq->data_len;
 }
 
 /*
@@ -1317,10 +1323,7 @@ static ide_startstop_t idefloppy_do_request (ide_drive_t *drive, struct request
 		pc = (idefloppy_pc_t *) rq->buffer;
 	} else if (blk_pc_request(rq)) {
 		pc = idefloppy_next_pc_storage(drive);
-		if (idefloppy_blockpc_cmd(floppy, pc, rq)) {
-			idefloppy_do_end_request(drive, 0, 0);
-			return ide_stopped;
-		}
+		idefloppy_blockpc_cmd(floppy, pc, rq);
 	} else {
 		blk_dump_rq_flags(rq,
 			"ide-floppy: unsupported command in queue");
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index c948a5c17a5d..9ae60a7400a2 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -1049,9 +1049,13 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 	unsigned long flags;
 	ide_driver_t *drv;
 	void __user *p = (void __user *)arg;
-	int err = 0, (*setfunc)(ide_drive_t *, int);
+	int err, (*setfunc)(ide_drive_t *, int);
 	u8 *val;
 
+	err = scsi_cmd_ioctl(file, bdev->bd_disk, cmd, p);
+	if (err != -ENOTTY)
+		return err;
+
 	switch (cmd) {
 	case HDIO_GET_32BIT:	    val = &drive->io_32bit;	 goto read_val;
 	case HDIO_GET_KEEPSETTINGS: val = &drive->keep_settings; goto read_val;
@@ -1171,10 +1175,6 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 			return 0;
 		}
 
-		case CDROMEJECT:
-		case CDROMCLOSETRAY:
-			return scsi_cmd_ioctl(file, bdev->bd_disk, cmd, p);
-
 		case HDIO_GET_BUSSTATE:
 			if (!capable(CAP_SYS_ADMIN))
 				return -EACCES;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fae138bd2207..53002d40efa2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -41,6 +41,8 @@ struct elevator_queue;
 typedef struct elevator_queue elevator_t;
 struct request_pm_state;
 struct blk_trace;
+struct request;
+struct sg_io_hdr;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -607,6 +609,11 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn;
 #define BLK_BOUNCE_ANY		((u64)blk_max_pfn << PAGE_SHIFT)
 #define BLK_BOUNCE_ISA		(ISA_DMA_THRESHOLD)
 
+/*
+ * default timeout for SG_IO if none specified
+ */
+#define BLK_DEFAULT_SG_TIMEOUT	(60 * HZ)
+
 #ifdef CONFIG_MMU
 extern int init_emergency_isa_pool(void);
 extern void blk_queue_bounce(request_queue_t *q, struct bio **bio);
@@ -680,6 +687,11 @@ extern int blk_execute_rq(request_queue_t *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
+extern int blk_fill_sghdr_rq(request_queue_t *, struct request *,
+		      struct sg_io_hdr *, int);
+extern int blk_unmap_sghdr_rq(struct request *, struct sg_io_hdr *);
+extern int blk_complete_sghdr_rq(struct request *, struct sg_io_hdr *,
+			  struct bio *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
diff --git a/include/linux/bsg.h b/include/linux/bsg.h
new file mode 100644
index 000000000000..dc0d7282c4cb
--- /dev/null
+++ b/include/linux/bsg.h
@@ -0,0 +1,21 @@
+#ifndef BSG_H
+#define BSG_H
+
+#if defined(CONFIG_BLK_DEV_BSG)
+struct bsg_class_device {
+	struct class_device *class_dev;
+	struct device *dev;
+	int minor;
+	struct gendisk *disk;
+	struct list_head list;
+};
+
+extern int bsg_register_disk(struct gendisk *);
+extern void bsg_unregister_disk(struct gendisk *);
+#else
+struct bsg_class_device { };
+#define bsg_register_disk(disk)		(0)
+#define bsg_unregister_disk(disk)	do { } while (0)
+#endif
+
+#endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 9756fc102a83..8c43d7032612 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -67,6 +67,7 @@ struct partition {
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+#include <linux/bsg.h>
 
 struct partition {
 	unsigned char boot_ind;		/* 0x80 - active */
@@ -91,6 +92,7 @@ struct hd_struct {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	int make_it_fail;
 #endif
+	struct bsg_class_device bsg_dev;
 };
 
 #define GENHD_FL_REMOVABLE			1
-- 
cgit v1.3-8-gc7d7


From 337ad41deae1b56e56731246322a93251df86e79 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Wed, 20 Dec 2006 11:18:54 +0100
Subject: block: export blk_verify_command for SG v4

blk_fill_sghdr_rq doesn't work for SG v4 so verify_command needed to
be exported.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/scsi_ioctl.c     | 7 ++++---
 include/linux/blkdev.h | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index daded70ffbb1..db53b2c268d3 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -112,7 +112,7 @@ static int sg_emulated_host(request_queue_t *q, int __user *p)
 #define safe_for_read(cmd)	[cmd] = CMD_READ_SAFE
 #define safe_for_write(cmd)	[cmd] = CMD_WRITE_SAFE
 
-static int verify_command(unsigned char *cmd, int has_write_perm)
+int blk_verify_command(unsigned char *cmd, int has_write_perm)
 {
 	static unsigned char cmd_type[256] = {
 
@@ -212,6 +212,7 @@ static int verify_command(unsigned char *cmd, int has_write_perm)
 	/* Otherwise fail it with an "Operation not permitted" */
 	return -EPERM;
 }
+EXPORT_SYMBOL_GPL(blk_verify_command);
 
 int blk_fill_sghdr_rq(request_queue_t *q, struct request *rq,
 		      struct sg_io_hdr *hdr, int has_write_perm)
@@ -220,7 +221,7 @@ int blk_fill_sghdr_rq(request_queue_t *q, struct request *rq,
 
 	if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
 		return -EFAULT;
-	if (verify_command(rq->cmd, has_write_perm))
+	if (blk_verify_command(rq->cmd, has_write_perm))
 		return -EPERM;
 
 	/*
@@ -457,7 +458,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q,
 	if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
 		goto error;
 
-	err = verify_command(rq->cmd, file->f_mode & FMODE_WRITE);
+	err = blk_verify_command(rq->cmd, file->f_mode & FMODE_WRITE);
 	if (err)
 		goto error;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 53002d40efa2..f6bc0d03ffad 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -692,6 +692,7 @@ extern int blk_fill_sghdr_rq(request_queue_t *, struct request *,
 extern int blk_unmap_sghdr_rq(struct request *, struct sg_io_hdr *);
 extern int blk_complete_sghdr_rq(struct request *, struct sg_io_hdr *,
 			  struct bio *);
+extern int blk_verify_command(unsigned char *, int);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
-- 
cgit v1.3-8-gc7d7


From 45977d0e87ac988d04fccfb89221727aaf8d78a4 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Wed, 20 Dec 2006 11:19:32 +0100
Subject: bsg: add sg_io_v4 structure

This patch adds sg_io_v4 structure that Doug proposed last month.

There's one major change from the RFC. I dropped iovec, which needs
compat stuff. The bsg code simply calls blk_rq_map_user against
dout_xferp/din_xferp. So if possible, the page frames are directly
mapped. If not possible, the block layer allocates new page frames and
does memory copies.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bsg.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index dc0d7282c4cb..0d212cc06abf 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -1,6 +1,47 @@
 #ifndef BSG_H
 #define BSG_H
 
+struct sg_io_v4 {
+	int32_t guard;		/* [i] 'Q' to differentiate from v3 */
+	uint32_t protocol;	/* [i] 0 -> SCSI , .... */
+	uint32_t subprotocol;	/* [i] 0 -> SCSI command, 1 -> SCSI task
+				   management function, .... */
+
+	uint32_t request_len;	/* [i] in bytes */
+	uint64_t request;	/* [i], [*i] {SCSI: cdb} */
+	uint32_t request_attr;	/* [i] {SCSI: task attribute} */
+	uint32_t request_tag;	/* [i] {SCSI: task tag (only if flagged)} */
+	uint32_t request_priority;	/* [i] {SCSI: task priority} */
+	uint32_t max_response_len;	/* [i] in bytes */
+	uint64_t response;	/* [i], [*o] {SCSI: (auto)sense data} */
+
+	/* "din_" for data in (from device); "dout_" for data out (to device) */
+	uint32_t dout_xfer_len;	/* [i] bytes to be transferred to device */
+	uint32_t din_xfer_len;	/* [i] bytes to be transferred from device */
+	uint64_t dout_xferp;	/* [i], [*i] */
+	uint64_t din_xferp;	/* [i], [*o] */
+
+	uint32_t timeout;	/* [i] units: millisecond */
+	uint32_t flags;		/* [i] bit mask */
+	uint64_t usr_ptr;	/* [i->o] unused internally */
+	uint32_t spare_in;	/* [i] */
+
+	uint32_t driver_status;	/* [o] 0 -> ok */
+	uint32_t transport_status;	/* [o] 0 -> ok */
+	uint32_t device_status;	/* [o] {SCSI: command completion status} */
+	uint32_t retry_delay;	/* [o] {SCSI: status auxiliary information} */
+	uint32_t info;		/* [o] additional information */
+	uint32_t duration;	/* [o] time to complete, in milliseconds */
+	uint32_t response_len;	/* [o] bytes of response actually written */
+	int32_t din_resid;	/* [o] actual_din_xfer_len - din_xfer_len */
+	uint32_t generated_tag;	/* [o] {SCSI: task tag that transport chose} */
+	uint32_t spare_out;	/* [o] */
+
+	uint32_t padding;
+};
+
+#ifdef __KERNEL__
+
 #if defined(CONFIG_BLK_DEV_BSG)
 struct bsg_class_device {
 	struct class_device *class_dev;
@@ -18,4 +59,6 @@ struct bsg_class_device { };
 #define bsg_unregister_disk(disk)	do { } while (0)
 #endif
 
+#endif /* __KERNEL__ */
+
 #endif
-- 
cgit v1.3-8-gc7d7


From 1594a3f0eb526c73bc3915e8da13f2abf0ea1acd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 20 Dec 2006 11:23:35 +0100
Subject: bsg: use u32 etc instead of uint32_t

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bsg.h | 58 ++++++++++++++++++++++++++---------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index 0d212cc06abf..f968726cfadc 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -2,42 +2,42 @@
 #define BSG_H
 
 struct sg_io_v4 {
-	int32_t guard;		/* [i] 'Q' to differentiate from v3 */
-	uint32_t protocol;	/* [i] 0 -> SCSI , .... */
-	uint32_t subprotocol;	/* [i] 0 -> SCSI command, 1 -> SCSI task
+	s32 guard;		/* [i] 'Q' to differentiate from v3 */
+	u32 protocol;		/* [i] 0 -> SCSI , .... */
+	u32 subprotocol;	/* [i] 0 -> SCSI command, 1 -> SCSI task
 				   management function, .... */
 
-	uint32_t request_len;	/* [i] in bytes */
-	uint64_t request;	/* [i], [*i] {SCSI: cdb} */
-	uint32_t request_attr;	/* [i] {SCSI: task attribute} */
-	uint32_t request_tag;	/* [i] {SCSI: task tag (only if flagged)} */
-	uint32_t request_priority;	/* [i] {SCSI: task priority} */
-	uint32_t max_response_len;	/* [i] in bytes */
-	uint64_t response;	/* [i], [*o] {SCSI: (auto)sense data} */
+	u32 request_len;	/* [i] in bytes */
+	u64 request;		/* [i], [*i] {SCSI: cdb} */
+	u32 request_attr;	/* [i] {SCSI: task attribute} */
+	u32 request_tag;	/* [i] {SCSI: task tag (only if flagged)} */
+	u32 request_priority;	/* [i] {SCSI: task priority} */
+	u32 max_response_len;	/* [i] in bytes */
+	u64 response;		/* [i], [*o] {SCSI: (auto)sense data} */
 
 	/* "din_" for data in (from device); "dout_" for data out (to device) */
-	uint32_t dout_xfer_len;	/* [i] bytes to be transferred to device */
-	uint32_t din_xfer_len;	/* [i] bytes to be transferred from device */
-	uint64_t dout_xferp;	/* [i], [*i] */
-	uint64_t din_xferp;	/* [i], [*o] */
+	u32 dout_xfer_len;	/* [i] bytes to be transferred to device */
+	u32 din_xfer_len;	/* [i] bytes to be transferred from device */
+	u64 dout_xferp;		/* [i], [*i] */
+	u64 din_xferp;		/* [i], [*o] */
 
-	uint32_t timeout;	/* [i] units: millisecond */
-	uint32_t flags;		/* [i] bit mask */
-	uint64_t usr_ptr;	/* [i->o] unused internally */
-	uint32_t spare_in;	/* [i] */
+	u32 timeout;		/* [i] units: millisecond */
+	u32 flags;		/* [i] bit mask */
+	u64 usr_ptr;		/* [i->o] unused internally */
+	u32 spare_in;		/* [i] */
 
-	uint32_t driver_status;	/* [o] 0 -> ok */
-	uint32_t transport_status;	/* [o] 0 -> ok */
-	uint32_t device_status;	/* [o] {SCSI: command completion status} */
-	uint32_t retry_delay;	/* [o] {SCSI: status auxiliary information} */
-	uint32_t info;		/* [o] additional information */
-	uint32_t duration;	/* [o] time to complete, in milliseconds */
-	uint32_t response_len;	/* [o] bytes of response actually written */
-	int32_t din_resid;	/* [o] actual_din_xfer_len - din_xfer_len */
-	uint32_t generated_tag;	/* [o] {SCSI: task tag that transport chose} */
-	uint32_t spare_out;	/* [o] */
+	u32 driver_status;	/* [o] 0 -> ok */
+	u32 transport_status;	/* [o] 0 -> ok */
+	u32 device_status;	/* [o] {SCSI: command completion status} */
+	u32 retry_delay;	/* [o] {SCSI: status auxiliary information} */
+	u32 info;		/* [o] additional information */
+	u32 duration;		/* [o] time to complete, in milliseconds */
+	u32 response_len;	/* [o] bytes of response actually written */
+	s32 din_resid;		/* [o] actual_din_xfer_len - din_xfer_len */
+	u32 generated_tag;	/* [o] {SCSI: task tag that transport chose} */
+	u32 spare_out;		/* [o] */
 
-	uint32_t padding;
+	u32 padding;
 };
 
 #ifdef __KERNEL__
-- 
cgit v1.3-8-gc7d7


From 3862153b673516b2efa0447b9b3778f47ac8f8c8 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Fri, 22 Dec 2006 09:43:51 +0100
Subject: Replace s32, u32 and u64 with __s32, __u32 and __u64 in bsg.h for
 userspace

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bsg.h | 58 ++++++++++++++++++++++++++---------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index f968726cfadc..2154a6dfbd53 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -2,42 +2,42 @@
 #define BSG_H
 
 struct sg_io_v4 {
-	s32 guard;		/* [i] 'Q' to differentiate from v3 */
-	u32 protocol;		/* [i] 0 -> SCSI , .... */
-	u32 subprotocol;	/* [i] 0 -> SCSI command, 1 -> SCSI task
+	__s32 guard;		/* [i] 'Q' to differentiate from v3 */
+	__u32 protocol;		/* [i] 0 -> SCSI , .... */
+	__u32 subprotocol;	/* [i] 0 -> SCSI command, 1 -> SCSI task
 				   management function, .... */
 
-	u32 request_len;	/* [i] in bytes */
-	u64 request;		/* [i], [*i] {SCSI: cdb} */
-	u32 request_attr;	/* [i] {SCSI: task attribute} */
-	u32 request_tag;	/* [i] {SCSI: task tag (only if flagged)} */
-	u32 request_priority;	/* [i] {SCSI: task priority} */
-	u32 max_response_len;	/* [i] in bytes */
-	u64 response;		/* [i], [*o] {SCSI: (auto)sense data} */
+	__u32 request_len;	/* [i] in bytes */
+	__u64 request;		/* [i], [*i] {SCSI: cdb} */
+	__u32 request_attr;	/* [i] {SCSI: task attribute} */
+	__u32 request_tag;	/* [i] {SCSI: task tag (only if flagged)} */
+	__u32 request_priority;	/* [i] {SCSI: task priority} */
+	__u32 max_response_len;	/* [i] in bytes */
+	__u64 response;		/* [i], [*o] {SCSI: (auto)sense data} */
 
 	/* "din_" for data in (from device); "dout_" for data out (to device) */
-	u32 dout_xfer_len;	/* [i] bytes to be transferred to device */
-	u32 din_xfer_len;	/* [i] bytes to be transferred from device */
-	u64 dout_xferp;		/* [i], [*i] */
-	u64 din_xferp;		/* [i], [*o] */
+	__u32 dout_xfer_len;	/* [i] bytes to be transferred to device */
+	__u32 din_xfer_len;	/* [i] bytes to be transferred from device */
+	__u64 dout_xferp;	/* [i], [*i] */
+	__u64 din_xferp;	/* [i], [*o] */
 
-	u32 timeout;		/* [i] units: millisecond */
-	u32 flags;		/* [i] bit mask */
-	u64 usr_ptr;		/* [i->o] unused internally */
-	u32 spare_in;		/* [i] */
+	__u32 timeout;		/* [i] units: millisecond */
+	__u32 flags;		/* [i] bit mask */
+	__u64 usr_ptr;		/* [i->o] unused internally */
+	__u32 spare_in;		/* [i] */
 
-	u32 driver_status;	/* [o] 0 -> ok */
-	u32 transport_status;	/* [o] 0 -> ok */
-	u32 device_status;	/* [o] {SCSI: command completion status} */
-	u32 retry_delay;	/* [o] {SCSI: status auxiliary information} */
-	u32 info;		/* [o] additional information */
-	u32 duration;		/* [o] time to complete, in milliseconds */
-	u32 response_len;	/* [o] bytes of response actually written */
-	s32 din_resid;		/* [o] actual_din_xfer_len - din_xfer_len */
-	u32 generated_tag;	/* [o] {SCSI: task tag that transport chose} */
-	u32 spare_out;		/* [o] */
+	__u32 driver_status;	/* [o] 0 -> ok */
+	__u32 transport_status;	/* [o] 0 -> ok */
+	__u32 device_status;	/* [o] {SCSI: command completion status} */
+	__u32 retry_delay;	/* [o] {SCSI: status auxiliary information} */
+	__u32 info;		/* [o] additional information */
+	__u32 duration;		/* [o] time to complete, in milliseconds */
+	__u32 response_len;	/* [o] bytes of response actually written */
+	__s32 din_resid;	/* [o] actual_din_xfer_len - din_xfer_len */
+	__u32 generated_tag;	/* [o] {SCSI: task tag that transport chose} */
+	__u32 spare_out;	/* [o] */
 
-	u32 padding;
+	__u32 padding;
 };
 
 #ifdef __KERNEL__
-- 
cgit v1.3-8-gc7d7


From 45e79a3acdcf54113b3d7b23e9e64e6541dbfeb5 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 9 Jul 2007 12:39:20 +0200
Subject: bsg: add a request_queue argument to scsi_cmd_ioctl()

bsg uses scsi_cmd_ioctl() for some SCSI/sg ioctl
commands. scsi_cmd_ioctl() gets a request queue from a gendisk
arguement. This prevents bsg being bound to SCSI devices that don't
have a gendisk (like OSD). This adds a request_queue argument to
scsi_cmd_ioctl(). The SCSI/sg ioctl commands doesn't use a gendisk so
it's safe for any SCSI devices to use scsi_cmd_ioctl().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/bsg.c            |  2 +-
 block/scsi_ioctl.c     | 10 +++-------
 drivers/block/ub.c     |  2 +-
 drivers/cdrom/cdrom.c  |  3 ++-
 drivers/ide/ide.c      |  2 +-
 drivers/scsi/sd.c      |  2 +-
 drivers/scsi/st.c      |  3 ++-
 include/linux/blkdev.h |  3 ++-
 8 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index c85d961ee41e..0427ece9b6d8 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -900,7 +900,7 @@ bsg_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 	case SG_EMULATED_HOST:
 	case SCSI_IOCTL_SEND_COMMAND: {
 		void __user *uarg = (void __user *) arg;
-		return scsi_cmd_ioctl(file, bd->disk, cmd, uarg);
+		return scsi_cmd_ioctl(file, bd->queue, bd->disk, cmd, uarg);
 	}
 	case SG_IO: {
 		struct request *rq;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index db53b2c268d3..a26ba07955fe 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -548,16 +548,12 @@ static inline int blk_send_start_stop(request_queue_t *q, struct gendisk *bd_dis
 	return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
 }
 
-int scsi_cmd_ioctl(struct file *file, struct gendisk *bd_disk, unsigned int cmd, void __user *arg)
+int scsi_cmd_ioctl(struct file *file, struct request_queue *q,
+		   struct gendisk *bd_disk, unsigned int cmd, void __user *arg)
 {
-	request_queue_t *q;
 	int err;
 
-	q = bd_disk->queue;
-	if (!q)
-		return -ENXIO;
-
-	if (blk_get_queue(q))
+	if (!q || blk_get_queue(q))
 		return -ENXIO;
 
 	switch (cmd) {
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 18c8b6c0db20..8b13d7d2cb63 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1709,7 +1709,7 @@ static int ub_bd_ioctl(struct inode *inode, struct file *filp,
 	struct gendisk *disk = inode->i_bdev->bd_disk;
 	void __user *usermem = (void __user *) arg;
 
-	return scsi_cmd_ioctl(filp, disk, cmd, usermem);
+	return scsi_cmd_ioctl(filp, disk->queue, disk, cmd, usermem);
 }
 
 /*
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index aa5468f487ba..499019bf8f40 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2695,11 +2695,12 @@ int cdrom_ioctl(struct file * file, struct cdrom_device_info *cdi,
 {
 	void __user *argp = (void __user *)arg;
 	int ret;
+	struct gendisk *disk = ip->i_bdev->bd_disk;
 
 	/*
 	 * Try the generic SCSI command ioctl's first.
 	 */
-	ret = scsi_cmd_ioctl(file, ip->i_bdev->bd_disk, cmd, argp);
+	ret = scsi_cmd_ioctl(file, disk->queue, disk, cmd, argp);
 	if (ret != -ENOTTY)
 		return ret;
 
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 9ae60a7400a2..8cd7694593c9 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -1052,7 +1052,7 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device
 	int err, (*setfunc)(ide_drive_t *, int);
 	u8 *val;
 
-	err = scsi_cmd_ioctl(file, bdev->bd_disk, cmd, p);
+	err = scsi_cmd_ioctl(file, bdev->bd_disk->queue, bdev->bd_disk, cmd, p);
 	if (err != -ENOTTY)
 		return err;
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 448d316f12d7..424d557284a9 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -684,7 +684,7 @@ static int sd_ioctl(struct inode * inode, struct file * filp,
 		case SCSI_IOCTL_GET_BUS_NUMBER:
 			return scsi_ioctl(sdp, cmd, p);
 		default:
-			error = scsi_cmd_ioctl(filp, disk, cmd, p);
+			error = scsi_cmd_ioctl(filp, disk->queue, disk, cmd, p);
 			if (error != -ENOTTY)
 				return error;
 	}
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 55bfeccf68a2..a4f7b8465773 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3549,7 +3549,8 @@ static int st_ioctl(struct inode *inode, struct file *file,
 			    !capable(CAP_SYS_RAWIO))
 				i = -EPERM;
 			else
-				i = scsi_cmd_ioctl(file, STp->disk, cmd_in, p);
+				i = scsi_cmd_ioctl(file, STp->disk->queue,
+						   STp->disk, cmd_in, p);
 			if (i != -ENOTTY)
 				return i;
 			break;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f6bc0d03ffad..2746632c2267 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -644,7 +644,8 @@ extern void blk_requeue_request(request_queue_t *, struct request *);
 extern void blk_plug_device(request_queue_t *);
 extern int blk_remove_plug(request_queue_t *);
 extern void blk_recount_segments(request_queue_t *, struct bio *);
-extern int scsi_cmd_ioctl(struct file *, struct gendisk *, unsigned int, void __user *);
+extern int scsi_cmd_ioctl(struct file *, struct request_queue *,
+			  struct gendisk *, unsigned int, void __user *);
 extern int sg_scsi_ioctl(struct file *, struct request_queue *,
 		struct gendisk *, struct scsi_ioctl_command __user *);
 
-- 
cgit v1.3-8-gc7d7


From d351af01b9307566135cb0f355ca65d0952c10b5 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 9 Jul 2007 12:40:35 +0200
Subject: bsg: bind bsg to request_queue instead of gendisk

This patch binds bsg devices to request_queue instead of gendisk. Any
objects (like transport entities) can define own request_handler and
create own bsg device.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/bsg.c            | 37 +++++++++++++++++--------------------
 block/ll_rw_blk.c      |  4 ++--
 include/linux/blkdev.h |  5 +++++
 include/linux/bsg.h    | 10 +++++-----
 include/linux/genhd.h  |  2 --
 5 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index 0427ece9b6d8..4ea4bedb413f 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -34,7 +34,6 @@
 static char bsg_version[] = "block layer sg (bsg) 0.4";
 
 struct bsg_device {
-	struct gendisk *disk;
 	request_queue_t *queue;
 	spinlock_t lock;
 	struct list_head busy_list;
@@ -46,7 +45,7 @@ struct bsg_device {
 	int done_cmds;
 	wait_queue_head_t wq_done;
 	wait_queue_head_t wq_free;
-	char name[BDEVNAME_SIZE];
+	char name[BUS_ID_SIZE];
 	int max_queue;
 	unsigned long flags;
 };
@@ -375,7 +374,7 @@ static void bsg_add_command(struct bsg_device *bd, request_queue_t *q,
 	dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc);
 
 	rq->end_io_data = bc;
-	blk_execute_rq_nowait(q, bd->disk, rq, 1, bsg_rq_end_io);
+	blk_execute_rq_nowait(q, NULL, rq, 1, bsg_rq_end_io);
 }
 
 static inline struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
@@ -741,7 +740,7 @@ out:
 }
 
 static struct bsg_device *bsg_add_device(struct inode *inode,
-					 struct gendisk *disk,
+					 struct request_queue *rq,
 					 struct file *file)
 {
 	struct bsg_device *bd = NULL;
@@ -753,17 +752,16 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 	if (!bd)
 		return ERR_PTR(-ENOMEM);
 
-	bd->disk = disk;
-	bd->queue = disk->queue;
-	kobject_get(&disk->queue->kobj);
+	bd->queue = rq;
+	kobject_get(&rq->kobj);
 	bsg_set_block(bd, file);
 
 	atomic_set(&bd->ref_count, 1);
 	bd->minor = iminor(inode);
 	mutex_lock(&bsg_mutex);
-	hlist_add_head(&bd->dev_list,&bsg_device_list[bsg_list_idx(bd->minor)]);
+	hlist_add_head(&bd->dev_list, &bsg_device_list[bsg_list_idx(bd->minor)]);
 
-	strncpy(bd->name, disk->disk_name, sizeof(bd->name) - 1);
+	strncpy(bd->name, rq->bsg_dev.class_dev->class_id, sizeof(bd->name) - 1);
 	dprintk("bound to <%s>, max queue %d\n",
 		format_dev_t(buf, inode->i_rdev), bd->max_queue);
 
@@ -817,7 +815,7 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
 	if (!bcd)
 		return ERR_PTR(-ENODEV);
 
-	return bsg_add_device(inode, bcd->disk, file);
+	return bsg_add_device(inode, bcd->queue, file);
 }
 
 static int bsg_open(struct inode *inode, struct file *file)
@@ -900,7 +898,7 @@ bsg_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 	case SG_EMULATED_HOST:
 	case SCSI_IOCTL_SEND_COMMAND: {
 		void __user *uarg = (void __user *) arg;
-		return scsi_cmd_ioctl(file, bd->queue, bd->disk, cmd, uarg);
+		return scsi_cmd_ioctl(file, bd->queue, NULL, cmd, uarg);
 	}
 	case SG_IO: {
 		struct request *rq;
@@ -915,7 +913,7 @@ bsg_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 			return PTR_ERR(rq);
 
 		bio = rq->bio;
-		blk_execute_rq(bd->queue, bd->disk, rq, 0);
+		blk_execute_rq(bd->queue, NULL, rq, 0);
 		blk_complete_sgv4_hdr_rq(rq, &hdr, bio);
 
 		if (copy_to_user(uarg, &hdr, sizeof(hdr)))
@@ -945,24 +943,23 @@ static struct file_operations bsg_fops = {
 	.owner		=	THIS_MODULE,
 };
 
-void bsg_unregister_disk(struct gendisk *disk)
+void bsg_unregister_queue(struct request_queue *q)
 {
-	struct bsg_class_device *bcd = &disk->bsg_dev;
+	struct bsg_class_device *bcd = &q->bsg_dev;
 
 	if (!bcd->class_dev)
 		return;
 
 	mutex_lock(&bsg_mutex);
-	sysfs_remove_link(&bcd->disk->queue->kobj, "bsg");
+	sysfs_remove_link(&q->kobj, "bsg");
 	class_device_destroy(bsg_class, MKDEV(BSG_MAJOR, bcd->minor));
 	bcd->class_dev = NULL;
 	list_del_init(&bcd->list);
 	mutex_unlock(&bsg_mutex);
 }
 
-int bsg_register_disk(struct gendisk *disk)
+int bsg_register_queue(struct request_queue *q, char *name)
 {
-	request_queue_t *q = disk->queue;
 	struct bsg_class_device *bcd;
 	dev_t dev;
 
@@ -972,7 +969,7 @@ int bsg_register_disk(struct gendisk *disk)
 	if (!q->request_fn)
 		return 0;
 
-	bcd = &disk->bsg_dev;
+	bcd = &q->bsg_dev;
 	memset(bcd, 0, sizeof(*bcd));
 	INIT_LIST_HEAD(&bcd->list);
 
@@ -980,8 +977,8 @@ int bsg_register_disk(struct gendisk *disk)
 	dev = MKDEV(BSG_MAJOR, bsg_device_nr);
 	bcd->minor = bsg_device_nr;
 	bsg_device_nr++;
-	bcd->disk = disk;
-	bcd->class_dev = class_device_create(bsg_class, NULL, dev, bcd->dev, "%s", disk->disk_name);
+	bcd->queue = q;
+	bcd->class_dev = class_device_create(bsg_class, NULL, dev, bcd->dev, "%s", name);
 	if (!bcd->class_dev)
 		goto err;
 	list_add_tail(&bcd->list, &bsg_class_list);
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 3795e0708a22..74a5498c29a1 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -4091,7 +4091,7 @@ int blk_register_queue(struct gendisk *disk)
 		return ret;
 	}
 
-	ret = bsg_register_disk(disk);
+	ret = bsg_register_queue(q, disk->disk_name);
 	if (ret) {
 		elv_unregister_queue(q);
 		kobject_unregister(&q->kobj);
@@ -4106,7 +4106,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	request_queue_t *q = disk->queue;
 
 	if (q && q->request_fn) {
-		bsg_unregister_disk(disk);
+		bsg_unregister_queue(q);
 		elv_unregister_queue(q);
 
 		kobject_uevent(&q->kobj, KOBJ_REMOVE);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2746632c2267..24b474e05a44 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -14,6 +14,7 @@
 #include <linux/bio.h>
 #include <linux/module.h>
 #include <linux/stringify.h>
+#include <linux/bsg.h>
 
 #include <asm/scatterlist.h>
 
@@ -470,6 +471,10 @@ struct request_queue
 	unsigned int		bi_size;
 
 	struct mutex		sysfs_lock;
+
+#if defined(CONFIG_BLK_DEV_BSG)
+	struct bsg_class_device bsg_dev;
+#endif
 };
 
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index 2154a6dfbd53..0475a6d3ff6a 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -47,16 +47,16 @@ struct bsg_class_device {
 	struct class_device *class_dev;
 	struct device *dev;
 	int minor;
-	struct gendisk *disk;
 	struct list_head list;
+	struct request_queue *queue;
 };
 
-extern int bsg_register_disk(struct gendisk *);
-extern void bsg_unregister_disk(struct gendisk *);
+extern int bsg_register_queue(struct request_queue *, char *);
+extern void bsg_unregister_queue(struct request_queue *);
 #else
 struct bsg_class_device { };
-#define bsg_register_disk(disk)		(0)
-#define bsg_unregister_disk(disk)	do { } while (0)
+#define bsg_register_queue(disk, name)		(0)
+#define bsg_unregister_queue(disk)	do { } while (0)
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 8c43d7032612..9756fc102a83 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -67,7 +67,6 @@ struct partition {
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/workqueue.h>
-#include <linux/bsg.h>
 
 struct partition {
 	unsigned char boot_ind;		/* 0x80 - active */
@@ -92,7 +91,6 @@ struct hd_struct {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	int make_it_fail;
 #endif
-	struct bsg_class_device bsg_dev;
 };
 
 #define GENHD_FL_REMOVABLE			1
-- 
cgit v1.3-8-gc7d7


From 4cf0723ac89b5f2189da2ad07ef875de26b83c77 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 30 Mar 2007 11:19:39 +0200
Subject: bsg: minor bug fixes

This fixes the following minor issues:

- add EXPORT_SYMBOL_GPL for bsg_register_queue and
bsg_unregister_queue.

- shut up gcc warnings

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <axboe@nelson.home.kernel.dk>
---
 block/bsg.c         | 4 +++-
 include/linux/bsg.h | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index 4ef3cc550244..a333c9337093 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -961,8 +961,9 @@ void bsg_unregister_queue(struct request_queue *q)
 	bsg_device_nr--;
 	mutex_unlock(&bsg_mutex);
 }
+EXPORT_SYMBOL_GPL(bsg_unregister_queue);
 
-int bsg_register_queue(struct request_queue *q, char *name)
+int bsg_register_queue(struct request_queue *q, const char *name)
 {
 	struct bsg_class_device *bcd, *__bcd;
 	dev_t dev;
@@ -1025,6 +1026,7 @@ err:
 	mutex_unlock(&bsg_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bsg_register_queue);
 
 static int bsg_add(struct class_device *cl_dev, struct class_interface *cl_intf)
 {
diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index 0475a6d3ff6a..0dd01f90ba5e 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -51,7 +51,7 @@ struct bsg_class_device {
 	struct request_queue *queue;
 };
 
-extern int bsg_register_queue(struct request_queue *, char *);
+extern int bsg_register_queue(struct request_queue *, const char *);
 extern void bsg_unregister_queue(struct request_queue *);
 #else
 struct bsg_class_device { };
-- 
cgit v1.3-8-gc7d7


From abae1fde63fcdd2a3abaa0d7930938d8326f83d2 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 16 Jul 2007 08:52:14 +0200
Subject: add a struct request pointer to the request structure

This adds a struct request pointer to the request structure for the
second data phase (bidi for now). A request queue supporting bidi
requests sets QUEUE_FLAG_BIDI. This prevents sending bidi requests to
a non-bidi queue.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ll_rw_blk.c      | 1 +
 include/linux/blkdev.h | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index ef42bb2b12b6..11e4235d0b0c 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -256,6 +256,7 @@ static void rq_init(request_queue_t *q, struct request *rq)
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
 	rq->completion_data = NULL;
+	rq->next_rq = NULL;
 }
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 24b474e05a44..b32564a1e105 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -317,6 +317,9 @@ struct request {
 	 */
 	rq_end_io_fn *end_io;
 	void *end_io_data;
+
+	/* for bidi */
+	struct request *next_rq;
 };
 
 /*
@@ -486,6 +489,7 @@ struct request_queue
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 #define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
+#define QUEUE_FLAG_BIDI		9	/* queue supports bidi requests */
 
 enum {
 	/*
@@ -550,6 +554,7 @@ enum {
 #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
+#define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
-- 
cgit v1.3-8-gc7d7


From 15d10b611fa94b52f004a08a1d4cf7b39de3cba3 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 16 Jul 2007 08:52:16 +0200
Subject: bsg: add SCSI transport-level request support

This enables bsg to handle SCSI transport-level request like SAS
management protocol (SMP).

- add BSG_SUB_PROTOCOL_{SCSI_CMD, SCSI_TMF, SCSI_TRANSPORT} definitions.
- SCSI transport-level requests skip blk_verify_command().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/bsg.c         | 27 +++++++++++++++++++++------
 include/linux/bsg.h |  6 ++++++
 2 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index 13ecc951a4c0..461c9f56f3ee 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -208,7 +208,11 @@ static int blk_fill_sgv4_hdr_rq(request_queue_t *q, struct request *rq,
 	if (copy_from_user(rq->cmd, (void *)(unsigned long)hdr->request,
 			   hdr->request_len))
 		return -EFAULT;
-	if (blk_verify_command(rq->cmd, has_write_perm))
+
+	if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
+		if (blk_verify_command(rq->cmd, has_write_perm))
+			return -EPERM;
+	} else if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
 	/*
@@ -232,6 +236,8 @@ static int blk_fill_sgv4_hdr_rq(request_queue_t *q, struct request *rq,
 static int
 bsg_validate_sgv4_hdr(request_queue_t *q, struct sg_io_v4 *hdr, int *rw)
 {
+	int ret = 0;
+
 	if (hdr->guard != 'Q')
 		return -EINVAL;
 	if (hdr->request_len > BLK_MAX_CDB)
@@ -240,13 +246,22 @@ bsg_validate_sgv4_hdr(request_queue_t *q, struct sg_io_v4 *hdr, int *rw)
 	    hdr->din_xfer_len > (q->max_sectors << 9))
 		return -EIO;
 
-	/* not supported currently */
-	if (hdr->protocol || hdr->subprotocol)
-		return -EINVAL;
+	switch (hdr->protocol) {
+	case BSG_PROTOCOL_SCSI:
+		switch (hdr->subprotocol) {
+		case BSG_SUB_PROTOCOL_SCSI_CMD:
+		case BSG_SUB_PROTOCOL_SCSI_TRANSPORT:
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		break;
+	default:
+		ret = -EINVAL;
+	}
 
 	*rw = hdr->dout_xfer_len ? WRITE : READ;
-
-	return 0;
+	return ret;
 }
 
 /*
diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index 0dd01f90ba5e..bd998ca6cb2e 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -1,6 +1,12 @@
 #ifndef BSG_H
 #define BSG_H
 
+#define BSG_PROTOCOL_SCSI		0
+
+#define BSG_SUB_PROTOCOL_SCSI_CMD	0
+#define BSG_SUB_PROTOCOL_SCSI_TMF	1
+#define BSG_SUB_PROTOCOL_SCSI_TRANSPORT	2
+
 struct sg_io_v4 {
 	__s32 guard;		/* [i] 'Q' to differentiate from v3 */
 	__u32 protocol;		/* [i] 0 -> SCSI , .... */
-- 
cgit v1.3-8-gc7d7


From d6d281684913dabb878e2f53219eed5df2cd867b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 28 Jun 2007 08:38:16 -0400
Subject: KVM: Remove kvmfs in favor of the anonymous inodes source

kvm uses a pseudo filesystem, kvmfs, to generate inodes, a job that the
new anonymous inodes source does much better.

Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 drivers/kvm/kvm_main.c | 143 ++++---------------------------------------------
 fs/anon_inodes.c       |   1 +
 include/linux/magic.h  |   1 -
 3 files changed, 12 insertions(+), 133 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 5603000573ec..26ca90f74fc7 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -43,6 +43,7 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
+#include <linux/anon_inodes.h>
 
 #include "x86_emulate.h"
 #include "segment_descriptor.h"
@@ -81,8 +82,6 @@ static struct kvm_stats_debugfs_item {
 
 static struct dentry *debugfs_dir;
 
-struct vfsmount *kvmfs_mnt;
-
 #define MAX_IO_MSRS 256
 
 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
@@ -104,55 +103,6 @@ struct segment_descriptor_64 {
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 			   unsigned long arg);
 
-static struct inode *kvmfs_inode(struct file_operations *fops)
-{
-	int error = -ENOMEM;
-	struct inode *inode = new_inode(kvmfs_mnt->mnt_sb);
-
-	if (!inode)
-		goto eexit_1;
-
-	inode->i_fop = fops;
-
-	/*
-	 * Mark the inode dirty from the very beginning,
-	 * that way it will never be moved to the dirty
-	 * list because mark_inode_dirty() will think
-	 * that it already _is_ on the dirty list.
-	 */
-	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IRUSR | S_IWUSR;
-	inode->i_uid = current->fsuid;
-	inode->i_gid = current->fsgid;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	return inode;
-
-eexit_1:
-	return ERR_PTR(error);
-}
-
-static struct file *kvmfs_file(struct inode *inode, void *private_data)
-{
-	struct file *file = get_empty_filp();
-
-	if (!file)
-		return ERR_PTR(-ENFILE);
-
-	file->f_path.mnt = mntget(kvmfs_mnt);
-	file->f_path.dentry = d_alloc_anon(inode);
-	if (!file->f_path.dentry)
-		return ERR_PTR(-ENOMEM);
-	file->f_mapping = inode->i_mapping;
-
-	file->f_pos = 0;
-	file->f_flags = O_RDWR;
-	file->f_op = inode->i_fop;
-	file->f_mode = FMODE_READ | FMODE_WRITE;
-	file->f_version = 0;
-	file->private_data = private_data;
-	return file;
-}
-
 unsigned long segment_base(u16 selector)
 {
 	struct descriptor_table gdt;
@@ -2413,34 +2363,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 	struct inode *inode;
 	struct file *file;
 
+	r = anon_inode_getfd(&fd, &inode, &file,
+			     "kvm-vcpu", &kvm_vcpu_fops, vcpu);
+	if (r)
+		return r;
 	atomic_inc(&vcpu->kvm->filp->f_count);
-	inode = kvmfs_inode(&kvm_vcpu_fops);
-	if (IS_ERR(inode)) {
-		r = PTR_ERR(inode);
-		goto out1;
-	}
-
-	file = kvmfs_file(inode, vcpu);
-	if (IS_ERR(file)) {
-		r = PTR_ERR(file);
-		goto out2;
-	}
-
-	r = get_unused_fd();
-	if (r < 0)
-		goto out3;
-	fd = r;
-	fd_install(fd, file);
-
 	return fd;
-
-out3:
-	fput(file);
-out2:
-	iput(inode);
-out1:
-	fput(vcpu->kvm->filp);
-	return r;
 }
 
 /*
@@ -2905,41 +2833,18 @@ static int kvm_dev_ioctl_create_vm(void)
 	struct file *file;
 	struct kvm *kvm;
 
-	inode = kvmfs_inode(&kvm_vm_fops);
-	if (IS_ERR(inode)) {
-		r = PTR_ERR(inode);
-		goto out1;
-	}
-
 	kvm = kvm_create_vm();
-	if (IS_ERR(kvm)) {
-		r = PTR_ERR(kvm);
-		goto out2;
+	if (IS_ERR(kvm))
+		return PTR_ERR(kvm);
+	r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
+	if (r) {
+		kvm_destroy_vm(kvm);
+		return r;
 	}
 
-	file = kvmfs_file(inode, kvm);
-	if (IS_ERR(file)) {
-		r = PTR_ERR(file);
-		goto out3;
-	}
 	kvm->filp = file;
 
-	r = get_unused_fd();
-	if (r < 0)
-		goto out4;
-	fd = r;
-	fd_install(fd, file);
-
 	return fd;
-
-out4:
-	fput(file);
-out3:
-	kvm_destroy_vm(kvm);
-out2:
-	iput(inode);
-out1:
-	return r;
 }
 
 static long kvm_dev_ioctl(struct file *filp,
@@ -3211,18 +3116,6 @@ static struct sys_device kvm_sysdev = {
 
 hpa_t bad_page_address;
 
-static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
-			const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type kvm_fs_type = {
-	.name		= "kvmfs",
-	.get_sb		= kvmfs_get_sb,
-	.kill_sb	= kill_anon_super,
-};
-
 int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
 {
 	int r;
@@ -3307,14 +3200,6 @@ static __init int kvm_init(void)
 	if (r)
 		goto out4;
 
-	r = register_filesystem(&kvm_fs_type);
-	if (r)
-		goto out3;
-
-	kvmfs_mnt = kern_mount(&kvm_fs_type);
-	r = PTR_ERR(kvmfs_mnt);
-	if (IS_ERR(kvmfs_mnt))
-		goto out2;
 	kvm_init_debug();
 
 	kvm_init_msr_list();
@@ -3331,10 +3216,6 @@ static __init int kvm_init(void)
 
 out:
 	kvm_exit_debug();
-	mntput(kvmfs_mnt);
-out2:
-	unregister_filesystem(&kvm_fs_type);
-out3:
 	kvm_mmu_module_exit();
 out4:
 	return r;
@@ -3344,8 +3225,6 @@ static __exit void kvm_exit(void)
 {
 	kvm_exit_debug();
 	__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
-	mntput(kvmfs_mnt);
-	unregister_filesystem(&kvm_fs_type);
 	kvm_mmu_module_exit();
 }
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 40fe3a3222e4..edc67486238f 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -139,6 +139,7 @@ err_put_filp:
 	put_filp(file);
 	return error;
 }
+EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 /*
  * A single inode exist for all anon_inode files. Contrary to pipes,
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 9d713c03e3da..36cc20dfd142 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -13,7 +13,6 @@
 #define HPFS_SUPER_MAGIC	0xf995e849
 #define ISOFS_SUPER_MAGIC	0x9660
 #define JFFS2_SUPER_MAGIC	0x72b6
-#define KVMFS_SUPER_MAGIC	0x19700426
 #define ANON_INODE_FS_MAGIC	0x09041934
 
 #define MINIX_SUPER_MAGIC	0x137F		/* original minix fs */
-- 
cgit v1.3-8-gc7d7


From db912f963909b3cbc3a059b7528f6a1a1eb6ffae Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 24 May 2007 12:23:10 +0300
Subject: HOTPLUG: Add CPU_DYING notifier

KVM wants a notification when a cpu is about to die, so it can disable
hardware extensions, but at a time when user processes cannot be scheduled
on the cpu, so it doesn't try to use virtualization extensions after they
have been disabled.

This adds a CPU_DYING notification.  The notification is called in atomic
context on the doomed cpu.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/notifier.h |  3 +++
 kernel/cpu.c             | 16 ++++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 9431101bf876..576f2bb34cc8 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -196,6 +196,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
 #define CPU_LOCK_ACQUIRE	0x0008 /* Acquire all hotcpu locks */
 #define CPU_LOCK_RELEASE	0x0009 /* Release all hotcpu locks */
+#define CPU_DYING		0x000A /* CPU (unsigned)v not running any task,
+				        * not handling interrupts, soon dead */
 
 /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
  * operation in progress
@@ -208,6 +210,7 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 #define CPU_DOWN_PREPARE_FROZEN	(CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
 #define CPU_DOWN_FAILED_FROZEN	(CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
 #define CPU_DEAD_FROZEN		(CPU_DEAD | CPU_TASKS_FROZEN)
+#define CPU_DYING_FROZEN	(CPU_DYING | CPU_TASKS_FROZEN)
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_NOTIFIER_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 208cf3497c10..181ae7086029 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu)
 	write_unlock_irq(&tasklist_lock);
 }
 
+struct take_cpu_down_param {
+	unsigned long mod;
+	void *hcpu;
+};
+
 /* Take this CPU down. */
-static int take_cpu_down(void *unused)
+static int take_cpu_down(void *_param)
 {
+	struct take_cpu_down_param *param = _param;
 	int err;
 
+	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+				param->hcpu);
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 	cpumask_t old_allowed, tmp;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+	struct take_cpu_down_param tcd_param = {
+		.mod = mod,
+		.hcpu = hcpu,
+	};
 
 	if (num_online_cpus() == 1)
 		return -EBUSY;
@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 	set_cpus_allowed(current, tmp);
 
 	mutex_lock(&cpu_bitmask_lock);
-	p = __stop_machine_run(take_cpu_down, NULL, cpu);
+	p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
 	mutex_unlock(&cpu_bitmask_lock);
 
 	if (IS_ERR(p) || cpu_online(cpu)) {
-- 
cgit v1.3-8-gc7d7


From a52b1752c077cb919b71167c54968a0b91673281 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 9 Jul 2007 17:11:49 +0300
Subject: SMP: Allow smp_call_function_single() to current cpu

This removes the requirement for callers to get_cpu() to check in simple
cases.  This patch is for !CONFIG_SMP.

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/smp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 96ac21f8dd73..8039daced688 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/errno.h>
+#include <asm/system.h>
 
 extern void cpu_idle(void);
 
@@ -102,7 +103,11 @@ static inline void smp_send_reschedule(int cpu) { }
 static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
 					   void *info, int retry, int wait)
 {
-	return -EBUSY;
+	WARN_ON(cpuid != 0);
+	local_irq_disable();
+	func(info);
+	local_irq_enable();
+	return 0;
 }
 
 #endif /* !SMP */
-- 
cgit v1.3-8-gc7d7


From f2a11b158a24301e9158e9c873fa88e5eb775486 Mon Sep 17 00:00:00 2001
From: Nitin Gupta <nitingupta910@gmail.com>
Date: Sun, 15 Jul 2007 23:37:21 -0700
Subject: LZO1X: fix lzo1x_worst_compress

This is a correction for a macro which gives worst case compressed data
size by LZO1X.

This patch was provided by the LZO author (Markus Oberhumer).

Signed-off-by: Nitin Gupta <nitingupta910@gmail.com>
Cc: "Markus F.X.J. Oberhumer" <markus@oberhumer.com>
Cc: "Richard Purdie" <rpurdie@openedhand.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lzo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lzo.h b/include/linux/lzo.h
index 582d8b711a13..d793497ec1ca 100644
--- a/include/linux/lzo.h
+++ b/include/linux/lzo.h
@@ -17,7 +17,7 @@
 #define LZO1X_MEM_COMPRESS	(16384 * sizeof(unsigned char *))
 #define LZO1X_1_MEM_COMPRESS	LZO1X_MEM_COMPRESS
 
-#define lzo1x_worst_compress(x) (x + (x / 64) + 16 + 3)
+#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)
 
 /* This requires 'workmem' of size LZO1X_1_MEM_COMPRESS */
 int lzo1x_1_compress(const unsigned char *src, size_t src_len,
-- 
cgit v1.3-8-gc7d7


From 96d7fa421e6424ad9ef6d1d039375dc2edb63fe8 Mon Sep 17 00:00:00 2001
From: Kristian Hoegsberg <krh@redhat.com>
Date: Sun, 15 Jul 2007 23:37:24 -0700
Subject: lib: add idr_for_each()

This patch adds an iterator function for the idr data structure.  Compared
to just iterating through the idr with an integer and idr_find, this
iterator is (almost, but not quite) linear in the number of elements, as
opposed to the number of integers in the range covered by the idr.  This
makes a difference for sparse idrs, but more importantly, it's a nicer way
to iterate through the elements.

The drm subsystem is moving to idr for tracking contexts and drawables, and
with this change, we can use the idr exclusively for tracking these
resources.

[akpm@linux-foundation.org: fix comment]
Signed-off-by: Kristian Hoegsberg <krh@redhat.com>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Dave Airlie <airlied@linux.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/idr.h |  2 ++
 lib/idr.c           | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 915572fa030b..8442c0bffc06 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -78,6 +78,8 @@ void *idr_find(struct idr *idp, int id);
 int idr_pre_get(struct idr *idp, gfp_t gfp_mask);
 int idr_get_new(struct idr *idp, void *ptr, int *id);
 int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
+int idr_for_each(struct idr *idp,
+		 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_replace(struct idr *idp, void *ptr, int id);
 void idr_remove(struct idr *idp, int id);
 void idr_destroy(struct idr *idp);
diff --git a/lib/idr.c b/lib/idr.c
index b98f01a2eb94..d29da6159dae 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -436,6 +436,61 @@ void *idr_find(struct idr *idp, int id)
 }
 EXPORT_SYMBOL(idr_find);
 
+/**
+ * idr_for_each - iterate through all stored pointers
+ * @idp: idr handle
+ * @fn: function to be called for each pointer
+ * @data: data passed back to callback function
+ *
+ * Iterate over the pointers registered with the given idr.  The
+ * callback function will be called for each pointer currently
+ * registered, passing the id, the pointer and the data pointer passed
+ * to this function.  It is not safe to modify the idr tree while in
+ * the callback, so functions such as idr_get_new and idr_remove are
+ * not allowed.
+ *
+ * We check the return of @fn each time. If it returns anything other
+ * than 0, we break out and return that value.
+ *
+ * The caller must serialize idr_for_each() vs idr_get_new() and idr_remove().
+ */
+int idr_for_each(struct idr *idp,
+		 int (*fn)(int id, void *p, void *data), void *data)
+{
+	int n, id, max, error = 0;
+	struct idr_layer *p;
+	struct idr_layer *pa[MAX_LEVEL];
+	struct idr_layer **paa = &pa[0];
+
+	n = idp->layers * IDR_BITS;
+	p = idp->top;
+	max = 1 << n;
+
+	id = 0;
+	while (id < max) {
+		while (n > 0 && p) {
+			n -= IDR_BITS;
+			*paa++ = p;
+			p = p->ary[(id >> n) & IDR_MASK];
+		}
+
+		if (p) {
+			error = fn(id, (void *)p, data);
+			if (error)
+				break;
+		}
+
+		id += 1 << n;
+		while (n < fls(id)) {
+			n += IDR_BITS;
+			p = *--paa;
+		}
+	}
+
+	return error;
+}
+EXPORT_SYMBOL(idr_for_each);
+
 /**
  * idr_replace - replace pointer for given id
  * @idp: idr handle
-- 
cgit v1.3-8-gc7d7


From 23936cc0b5d89619c34c2dab11d8cf3d6f7ca028 Mon Sep 17 00:00:00 2001
From: Kristian Hoegsberg <krh@redhat.com>
Date: Sun, 15 Jul 2007 23:37:24 -0700
Subject: lib: add idr_remove_all

Remove all ids from the given idr tree.  idr_destroy() only frees up
unused, cached idp_layers, but this function will remove all id mappings
and leave all idp_layers unused.

A typical clean-up sequence for objects stored in an idr tree, will use
idr_for_each() to free all objects, if necessay, then idr_remove_all() to
remove all ids, and idr_destroy() to free up the cached idr_layers.

Signed-off-by: Kristian Hoegsberg <krh@redhat.com>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Dave Airlie <airlied@linux.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/idr.h |  1 +
 lib/idr.c           | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 8442c0bffc06..0edda411959c 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -82,6 +82,7 @@ int idr_for_each(struct idr *idp,
 		 int (*fn)(int id, void *p, void *data), void *data);
 void *idr_replace(struct idr *idp, void *ptr, int id);
 void idr_remove(struct idr *idp, int id);
+void idr_remove_all(struct idr *idp);
 void idr_destroy(struct idr *idp);
 void idr_init(struct idr *idp);
 
diff --git a/lib/idr.c b/lib/idr.c
index d29da6159dae..5ca67b3cfd35 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -390,6 +390,53 @@ void idr_remove(struct idr *idp, int id)
 }
 EXPORT_SYMBOL(idr_remove);
 
+/**
+ * idr_remove_all - remove all ids from the given idr tree
+ * @idp: idr handle
+ *
+ * idr_destroy() only frees up unused, cached idp_layers, but this
+ * function will remove all id mappings and leave all idp_layers
+ * unused.
+ *
+ * A typical clean-up sequence for objects stored in an idr tree, will
+ * use idr_for_each() to free all objects, if necessay, then
+ * idr_remove_all() to remove all ids, and idr_destroy() to free
+ * up the cached idr_layers.
+ */
+void idr_remove_all(struct idr *idp)
+{
+	int n, id, max, error = 0;
+	struct idr_layer *p;
+	struct idr_layer *pa[MAX_LEVEL];
+	struct idr_layer **paa = &pa[0];
+
+	n = idp->layers * IDR_BITS;
+	p = idp->top;
+	max = 1 << n;
+
+	id = 0;
+	while (id < max && !error) {
+		while (n > IDR_BITS && p) {
+			n -= IDR_BITS;
+			*paa++ = p;
+			p = p->ary[(id >> n) & IDR_MASK];
+		}
+
+		id += 1 << n;
+		while (n < fls(id)) {
+			if (p) {
+				memset(p, 0, sizeof *p);
+				free_layer(idp, p);
+			}
+			n += IDR_BITS;
+			p = *--paa;
+		}
+	}
+	idp->top = NULL;
+	idp->layers = 0;
+}
+EXPORT_SYMBOL(idr_remove_all);
+
 /**
  * idr_destroy - release all cached layers within an idr tree
  * idp: idr handle
-- 
cgit v1.3-8-gc7d7


From 18a8bd949d6adb311ea816125ff65050df1f3f6e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <Yinghai.Lu@Sun.COM>
Date: Sun, 15 Jul 2007 23:37:59 -0700
Subject: serial: convert early_uart to earlycon for 8250

Beacuse SERIAL_PORT_DFNS is removed from include/asm-i386/serial.h and
include/asm-x86_64/serial.h.  the serial8250_ports need to be probed late in
serial initializing stage.  the console_init=>serial8250_console_init=>
register_console=>serial8250_console_setup will return -ENDEV, and console
ttyS0 can not be enabled at that time.  need to wait till uart_add_one_port in
drivers/serial/serial_core.c to call register_console to get console ttyS0.
that is too late.

Make early_uart to use early_param, so uart console can be used earlier.  Make
it to be bootconsole with CON_BOOT flag, so can use console handover feature.
and it will switch to corresponding normal serial console automatically.

new command line will be:
	console=uart8250,io,0x3f8,9600n8
	console=uart8250,mmio,0xff5e0000,115200n8
or
	earlycon=uart8250,io,0x3f8,9600n8
	earlycon=uart8250,mmio,0xff5e0000,115200n8

it will print in very early stage:
	Early serial console at I/O port 0x3f8 (options '9600n8')
	console [uart0] enabled
later for console it will print:
	console handover: boot [uart0] -> real [ttyS0]

Signed-off-by: <yinghai.lu@sun.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Gerd Hoffmann <kraxel@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kernel-parameters.txt |  11 +++-
 arch/ia64/kernel/setup.c            |   4 --
 drivers/firmware/pcdp.c             |   6 +-
 drivers/serial/8250.c               |  28 +++------
 drivers/serial/8250_early.c         | 117 ++++++++++++++++++------------------
 drivers/serial/Kconfig              |  14 +++++
 include/asm-i386/fixmap.h           |   2 +
 include/asm-i386/io.h               |   1 +
 include/asm-x86_64/fixmap.h         |   4 +-
 include/asm-x86_64/io.h             |   1 +
 include/linux/console.h             |   2 +
 include/linux/serial.h              |   6 --
 include/linux/serial_8250.h         |   4 ++
 init/main.c                         |   5 +-
 kernel/printk.c                     |  22 +++++++
 15 files changed, 132 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4d880b3d1f35..62aab585d9d7 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -462,13 +462,20 @@ and is between 256 and 4096 characters. It is defined in the file
 			Documentation/networking/netconsole.txt for an
 			alternative.
 
-		uart,io,<addr>[,options]
-		uart,mmio,<addr>[,options]
+		uart[8250],io,<addr>[,options]
+		uart[8250],mmio,<addr>[,options]
 			Start an early, polled-mode console on the 8250/16550
 			UART at the specified I/O port or MMIO address,
 			switching to the matching ttyS device later.  The
 			options are the same as for ttyS, above.
 
+	earlycon=	[KNL] Output early console device and options.
+		uart[8250],io,<addr>[,options]
+		uart[8250],mmio,<addr>[,options]
+			Start an early, polled-mode console on the 8250/16550
+			UART at the specified I/O port or MMIO address.
+			The options are the same as for ttyS, above.
+
 	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver
 			Format:
 			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 188fb73c6845..4d9864cc92c9 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -390,10 +390,6 @@ early_console_setup (char *cmdline)
 	if (!efi_setup_pcdp_console(cmdline))
 		earlycons++;
 #endif
-#ifdef CONFIG_SERIAL_8250_CONSOLE
-	if (!early_serial_console_init(cmdline))
-		earlycons++;
-#endif
 
 	return (earlycons) ? 0 : -1;
 }
diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c
index 2b4b76e8bd72..58e9f8e457f8 100644
--- a/drivers/firmware/pcdp.c
+++ b/drivers/firmware/pcdp.c
@@ -15,6 +15,7 @@
 #include <linux/console.h>
 #include <linux/efi.h>
 #include <linux/serial.h>
+#include <linux/serial_8250.h>
 #include <asm/vga.h>
 #include "pcdp.h"
 
@@ -27,7 +28,7 @@ setup_serial_console(struct pcdp_uart *uart)
 	char parity;
 
 	mmio = (uart->addr.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY);
-	p += sprintf(p, "console=uart,%s,0x%lx",
+	p += sprintf(p, "uart8250,%s,0x%lx",
 		mmio ? "mmio" : "io", uart->addr.address);
 	if (uart->baud) {
 		p += sprintf(p, ",%lu", uart->baud);
@@ -41,7 +42,8 @@ setup_serial_console(struct pcdp_uart *uart)
 		}
 	}
 
-	return early_serial_console_init(options);
+	add_preferred_console("uart", 8250, &options[9]);
+	return setup_early_serial8250_console(options);
 #else
 	return -ENODEV;
 #endif
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index c84dab083a85..0b3ec38ae614 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -2514,12 +2514,18 @@ static int __init serial8250_console_setup(struct console *co, char *options)
 	return uart_set_options(port, co, baud, parity, bits, flow);
 }
 
+static int __init serial8250_console_early_setup(void)
+{
+	return serial8250_find_port_for_earlycon();
+}
+
 static struct uart_driver serial8250_reg;
 static struct console serial8250_console = {
 	.name		= "ttyS",
 	.write		= serial8250_console_write,
 	.device		= uart_console_device,
 	.setup		= serial8250_console_setup,
+	.early_setup	= serial8250_console_early_setup,
 	.flags		= CON_PRINTBUFFER,
 	.index		= -1,
 	.data		= &serial8250_reg,
@@ -2533,7 +2539,7 @@ static int __init serial8250_console_init(void)
 }
 console_initcall(serial8250_console_init);
 
-static int __init find_port(struct uart_port *p)
+int serial8250_find_port(struct uart_port *p)
 {
 	int line;
 	struct uart_port *port;
@@ -2546,26 +2552,6 @@ static int __init find_port(struct uart_port *p)
 	return -ENODEV;
 }
 
-int __init serial8250_start_console(struct uart_port *port, char *options)
-{
-	int line;
-
-	line = find_port(port);
-	if (line < 0)
-		return -ENODEV;
-
-	add_preferred_console("ttyS", line, options);
-	printk("Adding console on ttyS%d at %s 0x%lx (options '%s')\n",
-		line, port->iotype == UPIO_MEM ? "MMIO" : "I/O port",
-		port->iotype == UPIO_MEM ? (unsigned long) port->mapbase :
-		    (unsigned long) port->iobase, options);
-	if (!(serial8250_console.flags & CON_ENABLED)) {
-		serial8250_console.flags &= ~CON_PRINTBUFFER;
-		register_console(&serial8250_console);
-	}
-	return line;
-}
-
 #define SERIAL8250_CONSOLE	&serial8250_console
 #else
 #define SERIAL8250_CONSOLE	NULL
diff --git a/drivers/serial/8250_early.c b/drivers/serial/8250_early.c
index 7e511199b4c5..947c20507e1f 100644
--- a/drivers/serial/8250_early.c
+++ b/drivers/serial/8250_early.c
@@ -17,13 +17,11 @@
  * we locate the device directly by its MMIO or I/O port address.
  *
  * The user can specify the device directly, e.g.,
- *	console=uart,io,0x3f8,9600n8
- *	console=uart,mmio,0xff5e0000,115200n8
- * or platform code can call early_uart_console_init() to set
- * the early UART device.
- *
- * After the normal serial driver starts, we try to locate the
- * matching ttyS device and start a console there.
+ *	earlycon=uart8250,io,0x3f8,9600n8
+ *	earlycon=uart8250,mmio,0xff5e0000,115200n8
+ * or
+ *	console=uart8250,io,0x3f8,9600n8
+ *	console=uart8250,mmio,0xff5e0000,115200n8
  */
 
 #include <linux/tty.h>
@@ -32,17 +30,21 @@
 #include <linux/serial_core.h>
 #include <linux/serial_reg.h>
 #include <linux/serial.h>
+#include <linux/serial_8250.h>
 #include <asm/io.h>
 #include <asm/serial.h>
+#ifdef CONFIG_FIX_EARLYCON_MEM
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#endif
 
-struct early_uart_device {
+struct early_serial8250_device {
 	struct uart_port port;
 	char options[16];		/* e.g., 115200n8 */
 	unsigned int baud;
 };
 
-static struct early_uart_device early_device __initdata;
-static int early_uart_registered __initdata;
+static struct early_serial8250_device early_device;
 
 static unsigned int __init serial_in(struct uart_port *port, int offset)
 {
@@ -80,7 +82,7 @@ static void __init putc(struct uart_port *port, int c)
 	serial_out(port, UART_TX, c);
 }
 
-static void __init early_uart_write(struct console *console, const char *s, unsigned int count)
+static void __init early_serial8250_write(struct console *console, const char *s, unsigned int count)
 {
 	struct uart_port *port = &early_device.port;
 	unsigned int ier;
@@ -111,7 +113,7 @@ static unsigned int __init probe_baud(struct uart_port *port)
 	return (port->uartclk / 16) / quot;
 }
 
-static void __init init_port(struct early_uart_device *device)
+static void __init init_port(struct early_serial8250_device *device)
 {
 	struct uart_port *port = &device->port;
 	unsigned int divisor;
@@ -130,10 +132,9 @@ static void __init init_port(struct early_uart_device *device)
 	serial_out(port, UART_LCR, c & ~UART_LCR_DLAB);
 }
 
-static int __init parse_options(struct early_uart_device *device, char *options)
+static int __init parse_options(struct early_serial8250_device *device, char *options)
 {
 	struct uart_port *port = &device->port;
-	int mapsize = 64;
 	int mmio, length;
 
 	if (!options)
@@ -143,12 +144,18 @@ static int __init parse_options(struct early_uart_device *device, char *options)
 	if (!strncmp(options, "mmio,", 5)) {
 		port->iotype = UPIO_MEM;
 		port->mapbase = simple_strtoul(options + 5, &options, 0);
-		port->membase = ioremap(port->mapbase, mapsize);
+#ifdef CONFIG_FIX_EARLYCON_MEM
+		set_fixmap_nocache(FIX_EARLYCON_MEM_BASE, port->mapbase & PAGE_MASK);
+		port->membase = (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
+		port->membase += port->mapbase & ~PAGE_MASK;
+#else
+		port->membase = ioremap(port->mapbase, 64);
 		if (!port->membase) {
 			printk(KERN_ERR "%s: Couldn't ioremap 0x%lx\n",
 				__FUNCTION__, port->mapbase);
 			return -ENOMEM;
 		}
+#endif
 		mmio = 1;
 	} else if (!strncmp(options, "io,", 3)) {
 		port->iotype = UPIO_PORT;
@@ -175,9 +182,16 @@ static int __init parse_options(struct early_uart_device *device, char *options)
 	return 0;
 }
 
-static int __init early_uart_setup(struct console *console, char *options)
+static struct console early_serial8250_console __initdata = {
+	.name	= "uart",
+	.write	= early_serial8250_write,
+	.flags	= CON_PRINTBUFFER | CON_BOOT,
+	.index	= -1,
+};
+
+static int __init early_serial8250_setup(char *options)
 {
-	struct early_uart_device *device = &early_device;
+	struct early_serial8250_device *device = &early_device;
 	int err;
 
 	if (device->port.membase || device->port.iobase)
@@ -190,61 +204,48 @@ static int __init early_uart_setup(struct console *console, char *options)
 	return 0;
 }
 
-static struct console early_uart_console __initdata = {
-	.name	= "uart",
-	.write	= early_uart_write,
-	.setup	= early_uart_setup,
-	.flags	= CON_PRINTBUFFER,
-	.index	= -1,
-};
-
-static int __init early_uart_console_init(void)
-{
-	if (!early_uart_registered) {
-		register_console(&early_uart_console);
-		early_uart_registered = 1;
-	}
-	return 0;
-}
-console_initcall(early_uart_console_init);
-
-int __init early_serial_console_init(char *cmdline)
+int __init setup_early_serial8250_console(char *cmdline)
 {
 	char *options;
 	int err;
 
-	options = strstr(cmdline, "console=uart,");
-	if (!options)
-		return -ENODEV;
+	options = strstr(cmdline, "uart8250,");
+	if (!options) {
+		options = strstr(cmdline, "uart,");
+		if (!options)
+			return 0;
+	}
 
 	options = strchr(cmdline, ',') + 1;
-	if ((err = early_uart_setup(NULL, options)) < 0)
+	if ((err = early_serial8250_setup(options)) < 0)
 		return err;
-	return early_uart_console_init();
+
+	register_console(&early_serial8250_console);
+
+	return 0;
 }
 
-static int __init early_uart_console_switch(void)
+int __init serial8250_find_port_for_earlycon(void)
 {
-	struct early_uart_device *device = &early_device;
+	struct early_serial8250_device *device = &early_device;
 	struct uart_port *port = &device->port;
-	int mmio, line;
+	int line;
+	int ret;
 
-	if (!(early_uart_console.flags & CON_ENABLED))
-		return 0;
+	if (!device->port.membase && !device->port.iobase)
+		return -ENODEV;
 
-	/* Try to start the normal driver on a matching line.  */
-	mmio = (port->iotype == UPIO_MEM);
-	line = serial8250_start_console(port, device->options);
+	line = serial8250_find_port(port);
 	if (line < 0)
-		printk("No ttyS device at %s 0x%lx for console\n",
-			mmio ? "MMIO" : "I/O port",
-			mmio ? port->mapbase :
-			    (unsigned long) port->iobase);
+		return -ENODEV;
 
-	unregister_console(&early_uart_console);
-	if (mmio)
-		iounmap(port->membase);
+	ret = update_console_cmdline("uart", 8250,
+			     "ttyS", line, device->options);
+	if (ret < 0)
+		ret = update_console_cmdline("uart", 0,
+				     "ttyS", line, device->options);
 
-	return 0;
+	return ret;
 }
-late_initcall(early_uart_console_switch);
+
+early_param("earlycon", setup_early_serial8250_console);
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 2adbed4e10f3..cab42cbd920d 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -62,8 +62,22 @@ config SERIAL_8250_CONSOLE
 	  kernel will automatically use the first serial line, /dev/ttyS0, as
 	  system console.
 
+	  you can set that using a kernel command line option such as
+	  "console=uart8250,io,0x3f8,9600n8"
+	  "console=uart8250,mmio,0xff5e0000,115200n8".
+	  and it will switch to normal serial console when correponding port is
+	  ready.
+	  "earlycon=uart8250,io,0x3f8,9600n8"
+	  "earlycon=uart8250,mmio,0xff5e0000,115200n8".
+	  it will not only setup early console.
+
 	  If unsure, say N.
 
+config FIX_EARLYCON_MEM
+	bool
+	depends on X86
+	default y
+
 config SERIAL_8250_GSC
 	tristate
 	depends on SERIAL_8250 && GSC
diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
index 80ea052ee3a4..249e753ac805 100644
--- a/include/asm-i386/fixmap.h
+++ b/include/asm-i386/fixmap.h
@@ -54,6 +54,8 @@ extern unsigned long __FIXADDR_TOP;
 enum fixed_addresses {
 	FIX_HOLE,
 	FIX_VDSO,
+	FIX_DBGP_BASE,
+	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_X86_LOCAL_APIC
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 #endif
diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h
index e797586a5bfc..7b65b5b00034 100644
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -129,6 +129,7 @@ extern void iounmap(volatile void __iomem *addr);
  */
 extern void *bt_ioremap(unsigned long offset, unsigned long size);
 extern void bt_iounmap(void *addr, unsigned long size);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 /* Use early IO mappings for DMI because it's initialized early */
 #define dmi_ioremap bt_ioremap
diff --git a/include/asm-x86_64/fixmap.h b/include/asm-x86_64/fixmap.h
index e90e1677531b..2acb9b7f6418 100644
--- a/include/asm-x86_64/fixmap.h
+++ b/include/asm-x86_64/fixmap.h
@@ -35,6 +35,8 @@ enum fixed_addresses {
 	VSYSCALL_LAST_PAGE,
 	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
 	VSYSCALL_HPET,
+	FIX_DBGP_BASE,
+	FIX_EARLYCON_MEM_BASE,
 	FIX_HPET_BASE,
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 	FIX_IO_APIC_BASE_0,
@@ -84,7 +86,7 @@ static __always_inline unsigned long fix_to_virt(const unsigned int idx)
 	if (idx >= __end_of_fixed_addresses)
 		__this_fixmap_does_not_exist();
 
-        return __fix_to_virt(idx);
+	return __fix_to_virt(idx);
 }
 
 #endif
diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h
index de2cd9a2303a..7475095c5061 100644
--- a/include/asm-x86_64/io.h
+++ b/include/asm-x86_64/io.h
@@ -144,6 +144,7 @@ extern void early_iounmap(void *addr, unsigned long size);
  */
 extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
 extern void iounmap(volatile void __iomem *addr);
+extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 /*
  * ISA I/O bus memory addresses are 1:1 with the physical address.
diff --git a/include/linux/console.h b/include/linux/console.h
index 62ef6e11d0d2..c44d3dfde7a5 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -99,6 +99,7 @@ struct console {
 	struct tty_driver *(*device)(struct console *, int *);
 	void	(*unblank)(void);
 	int	(*setup)(struct console *, char *);
+	int	(*early_setup)(void);
 	short	flags;
 	short	index;
 	int	cflag;
@@ -107,6 +108,7 @@ struct console {
 };
 
 extern int add_preferred_console(char *name, int idx, char *options);
+extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
 extern struct console *console_drivers;
diff --git a/include/linux/serial.h b/include/linux/serial.h
index 33fc8cb8ddfb..deb714314fb1 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -177,11 +177,5 @@ struct serial_icounter_struct {
 #ifdef __KERNEL__
 #include <linux/compiler.h>
 
-/* Allow architectures to override entries in serial8250_ports[] at run time: */
-struct uart_port;	/* forward declaration */
-extern int early_serial_setup(struct uart_port *port);
-extern int early_serial_console_init(char *options);
-extern int serial8250_start_console(struct uart_port *port, char *options);
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SERIAL_H */
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 71310d80c09a..706ee9a4c80c 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -60,4 +60,8 @@ void serial8250_unregister_port(int line);
 void serial8250_suspend_port(int line);
 void serial8250_resume_port(int line);
 
+extern int serial8250_find_port(struct uart_port *p);
+extern int serial8250_find_port_for_earlycon(void);
+extern int setup_early_serial8250_console(char *cmdline);
+
 #endif
diff --git a/init/main.c b/init/main.c
index 0eb1c7463fe4..038c7cb52390 100644
--- a/init/main.c
+++ b/init/main.c
@@ -453,7 +453,10 @@ static int __init do_early_param(char *param, char *val)
 	struct obs_kernel_param *p;
 
 	for (p = __setup_start; p < __setup_end; p++) {
-		if (p->early && strcmp(param, p->str) == 0) {
+		if ((p->early && strcmp(param, p->str) == 0) ||
+		    (strcmp(param, "console") == 0 &&
+		     strcmp(p->str, "earlycon") == 0)
+		) {
 			if (p->setup_func(val) != 0)
 				printk(KERN_WARNING
 				       "Malformed early option '%s'\n", param);
diff --git a/kernel/printk.c b/kernel/printk.c
index 7ce9a8c13312..f46cc6dff46e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -726,6 +726,25 @@ int __init add_preferred_console(char *name, int idx, char *options)
 	return 0;
 }
 
+int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
+{
+	struct console_cmdline *c;
+	int i;
+
+	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+		if (strcmp(console_cmdline[i].name, name) == 0 &&
+			  console_cmdline[i].index == idx) {
+				c = &console_cmdline[i];
+				memcpy(c->name, name_new, sizeof(c->name));
+				c->name[sizeof(c->name) - 1] = 0;
+				c->options = options;
+				c->index = idx_new;
+				return i;
+		}
+	/* not found */
+	return -1;
+}
+
 #ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
 /**
  * suspend_console - suspend the console subsystem
@@ -942,6 +961,9 @@ void register_console(struct console *console)
 	if (preferred_console < 0 || bootconsole || !console_drivers)
 		preferred_console = selected_console;
 
+	if (console->early_setup)
+		console->early_setup();
+
 	/*
 	 *	See if we want to use this console driver. If we
 	 *	didn't select a console we take the first one
-- 
cgit v1.3-8-gc7d7


From f0c0b2b808f232741eadac272bd4bc51f18df0f4 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Sun, 15 Jul 2007 23:38:01 -0700
Subject: change zonelist order: zonelist order selection logic

Make zonelist creation policy selectable from sysctl/boot option v6.

This patch makes NUMA's zonelist (of pgdat) order selectable.
Available order are Default(automatic)/ Node-based / Zone-based.

[Default Order]
The kernel selects Node-based or Zone-based order automatically.

[Node-based Order]
This policy treats the locality of memory as the most important parameter.
Zonelist order is created by each zone's locality. This means lower zones
(ex. ZONE_DMA) can be used before higher zone (ex. ZONE_NORMAL) exhausion.
IOW. ZONE_DMA will be in the middle of zonelist.
current 2.6.21 kernel uses this.

Pros.
 * A user can expect local memory as much as possible.
Cons.
 * lower zone will be exhansted before higher zone. This may cause OOM_KILL.

Maybe suitable if ZONE_DMA is relatively big and you never see OOM_KILL
because of ZONE_DMA exhaution and you need the best locality.

(example)
assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL.

*node(0)'s memory allocation order:

 node(0)'s NORMAL -> node(0)'s DMA -> node(1)'s NORMAL.

*node(1)'s memory allocation order:

 node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA.

[Zone-based order]
This policy treats the zone type as the most important parameter.
Zonelist order is created by zone-type order. This means lower zone
never be used bofere higher zone exhaustion.
IOW. ZONE_DMA will be always at the tail of zonelist.

Pros.
 * OOM_KILL(bacause of lower zone) occurs only if the whole zones are exhausted.
Cons.
 * memory locality may not be best.

(example)
assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL.

*node(0)'s memory allocation order:

 node(0)'s NORMAL -> node(1)'s NORMAL -> node(0)'s DMA.

*node(1)'s memory allocation order:

 node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA.

bootoption "numa_zonelist_order=" and proc/sysctl is supporetd.

command:
%echo N > /proc/sys/vm/numa_zonelist_order

Will rebuild zonelist in Node-based order.

command:
%echo Z > /proc/sys/vm/numa_zonelist_order

Will rebuild zonelist in Zone-based order.

Thanks to Lee Schermerhorn, he gives me much help and codes.

[Lee.Schermerhorn@hp.com: add check_highest_zone to build_zonelists_in_zone_order]
[akpm@linux-foundation.org: build fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "jesse.barnes@intel.com" <jesse.barnes@intel.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kernel-parameters.txt |   5 +
 Documentation/sysctl/vm.txt         |  45 ++++++
 include/linux/mmzone.h              |   5 +
 kernel/sysctl.c                     |  11 ++
 mm/page_alloc.c                     | 273 +++++++++++++++++++++++++++++++++---
 5 files changed, 317 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 62aab585d9d7..4344f69ae24a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1196,6 +1196,11 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nowb		[ARM]
 
+	numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
+			one of ['zone', 'node', 'default'] can be specified
+			This can be set from sysctl after boot.
+			See Documentation/sysctl/vm.txt for details.
+
 	nr_uarts=	[SERIAL] maximum number of UARTs to be registered.
 
 	opl3=		[HW,OSS]
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 8cfca173d4bc..df3ff2095f9d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,7 @@ Currently, these files are in /proc/sys/vm:
 - min_slab_ratio
 - panic_on_oom
 - mmap_min_address
+- numa_zonelist_order
 
 ==============================================================
 
@@ -231,3 +232,47 @@ security module.  Setting this value to something like 64k will allow the
 vast majority of applications to work correctly and provide defense in depth
 against future potential kernel bugs.
 
+==============================================================
+
+numa_zonelist_order
+
+This sysctl is only for NUMA.
+'where the memory is allocated from' is controlled by zonelists.
+(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
+ you may be able to read ZONE_DMA as ZONE_DMA32...)
+
+In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following.
+ZONE_NORMAL -> ZONE_DMA
+This means that a memory allocation request for GFP_KERNEL will
+get memory from ZONE_DMA only when ZONE_NORMAL is not available.
+
+In NUMA case, you can think of following 2 types of order.
+Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL
+
+(A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL
+(B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA.
+
+Type(A) offers the best locality for processes on Node(0), but ZONE_DMA
+will be used before ZONE_NORMAL exhaustion. This increases possibility of
+out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small.
+
+Type(B) cannot offer the best locality but is more robust against OOM of
+the DMA zone.
+
+Type(A) is called as "Node" order. Type (B) is "Zone" order.
+
+"Node order" orders the zonelists by node, then by zone within each node.
+Specify "[Nn]ode" for zone order
+
+"Zone Order" orders the zonelists by zone type, then by node within each
+zone.  Specify "[Zz]one"for zode order.
+
+Specify "[Dd]efault" to request automatic configuration.  Autoconfiguration
+will select "node" order in following case.
+(1) if the DMA zone does not exist or
+(2) if the DMA zone comprises greater than 50% of the available memory or
+(3) if any node's DMA zone comprises greater than 60% of its local memory and
+    the amount of local memory is big enough.
+
+Otherwise, "zone" order will be selected. Default order is recommended unless
+this is causing problems for your system/application.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d09b1345a3a1..04b1636a970b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -566,6 +566,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
 
+extern int numa_zonelist_order_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
+extern char numa_zonelist_order[];
+#define NUMA_ZONELIST_ORDER_LEN 16	/* string buffer size */
+
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
 #ifndef numa_node_id
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d93e13d93f24..ccaebbbd75ae 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -958,6 +958,17 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_doulongvec_minmax,
 	},
+#ifdef CONFIG_NUMA
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "numa_zonelist_order",
+		.data		= &numa_zonelist_order,
+		.maxlen		= NUMA_ZONELIST_ORDER_LEN,
+		.mode		= 0644,
+		.proc_handler	= &numa_zonelist_order_handler,
+		.strategy	= &sysctl_string,
+	},
+#endif
 #endif
 #if defined(CONFIG_X86_32) || \
    (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05ace44852eb..092b2d8f2f0c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1621,8 +1621,8 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+				int nr_zones, enum zone_type zone_type)
 {
 	struct zone *zone;
 
@@ -1641,9 +1641,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
 	return nr_zones;
 }
 
+
+/*
+ *  zonelist_order:
+ *  0 = automatic detection of better ordering.
+ *  1 = order by ([node] distance, -zonetype)
+ *  2 = order by (-zonetype, [node] distance)
+ *
+ *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ *  the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT  0
+#define ZONELIST_ORDER_NODE     1
+#define ZONELIST_ORDER_ZONE     2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
 #ifdef CONFIG_NUMA
+/* The value user specified ....changed by config */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN	16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * interface for configure zonelist ordering.
+ * command line option "numa_zonelist_order"
+ *	= "[dD]efault	- default, automatic configuration.
+ *	= "[nN]ode 	- order by node locality, then by zone within node
+ *	= "[zZ]one      - order by zone, then by locality within zone
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+	if (*s == 'd' || *s == 'D') {
+		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+	} else if (*s == 'n' || *s == 'N') {
+		user_zonelist_order = ZONELIST_ORDER_NODE;
+	} else if (*s == 'z' || *s == 'Z') {
+		user_zonelist_order = ZONELIST_ORDER_ZONE;
+	} else {
+		printk(KERN_WARNING
+			"Ignoring invalid numa_zonelist_order value:  "
+			"%s\n", s);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+	if (s)
+		return __parse_numa_zonelist_order(s);
+	return 0;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	int ret;
+
+	if (write)
+		strncpy(saved_string, (char*)table->data,
+			NUMA_ZONELIST_ORDER_LEN);
+	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		int oldval = user_zonelist_order;
+		if (__parse_numa_zonelist_order((char*)table->data)) {
+			/*
+			 * bogus value.  restore saved string
+			 */
+			strncpy((char*)table->data, saved_string,
+				NUMA_ZONELIST_ORDER_LEN);
+			user_zonelist_order = oldval;
+		} else if (oldval != user_zonelist_order)
+			build_all_zonelists();
+	}
+	return 0;
+}
+
+
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
+
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1658,7 +1751,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -1704,13 +1797,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 	return best_node;
 }
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 {
-	int j, node, local_node;
 	enum zone_type i;
-	int prev_node, load;
+	int j;
 	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		for (j = 0; zonelist->zones[j] != NULL; j++)
+			;
+ 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
+		zonelist->zones[j] = NULL;
+	}
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+	enum zone_type i;
+	int pos, j, node;
+	int zone_type;		/* needs to be signed */
+	struct zone *z;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < nr_nodes; j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z)) {
+					zonelist->zones[pos++] = z;
+					check_highest_zone(zone_type);
+				}
+			}
+		}
+		zonelist->zones[pos] = NULL;
+	}
+}
+
+static int default_zonelist_order(void)
+{
+	int nid, zone_type;
+	unsigned long low_kmem_size,total_size;
+	struct zone *z;
+	int average_size;
+	/*
+         * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+	 * If they are really small and used heavily, the system can fall
+	 * into OOM very easily.
+	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 */
+	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+	low_kmem_size = 0;
+	total_size = 0;
+	for_each_online_node(nid) {
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+	}
+	if (!low_kmem_size ||  /* there are no DMA area. */
+	    low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
+		return ZONELIST_ORDER_NODE;
+	/*
+	 * look into each node's config.
+  	 * If there is a node whose DMA/DMA32 memory is very big area on
+ 	 * local memory, NODE_ORDER may be suitable.
+         */
+	average_size = total_size / (num_online_nodes() + 1);
+	for_each_online_node(nid) {
+		low_kmem_size = 0;
+		total_size = 0;
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+		if (low_kmem_size &&
+		    total_size > average_size && /* ignore small node */
+		    low_kmem_size > total_size * 70/100)
+			return ZONELIST_ORDER_NODE;
+	}
+	return ZONELIST_ORDER_ZONE;
+}
+
+static void set_zonelist_order(void)
+{
+	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+		current_zonelist_order = default_zonelist_order();
+	else
+		current_zonelist_order = user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+	int j, node, load;
+	enum zone_type i;
 	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zonelist *zonelist;
+	int order = current_zonelist_order;
 
 	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1723,6 +1932,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	load = num_online_nodes();
 	prev_node = local_node;
 	nodes_clear(used_mask);
+
+	memset(node_load, 0, sizeof(node_load));
+	memset(node_order, 0, sizeof(node_order));
+	j = 0;
+
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		int distance = node_distance(local_node, node);
 
@@ -1738,23 +1952,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-
 		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
+			node_load[node] = load;
+
 		prev_node = node;
 		load--;
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
+		if (order == ZONELIST_ORDER_NODE)
+			build_zonelists_in_node_order(pgdat, node);
+		else
+			node_order[j++] = node;	/* remember order */
+	}
 
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-			zonelist->zones[j] = NULL;
-		}
+	if (order == ZONELIST_ORDER_ZONE) {
+		/* calculate node order -- i.e., DMA last! */
+		build_zonelists_in_zone_order(pgdat, j);
 	}
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1771,9 +1987,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 	}
 }
 
+
 #else	/* CONFIG_NUMA */
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+static void set_zonelist_order(void)
+{
+	current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
 	enum zone_type i,j;
@@ -1809,7 +2031,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1820,7 +2042,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
@@ -1831,8 +2053,10 @@ static int __meminit __build_all_zonelists(void *dummy)
 	return 0;
 }
 
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
 {
+	set_zonelist_order();
+
 	if (system_state == SYSTEM_BOOTING) {
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
@@ -1843,8 +2067,13 @@ void __meminit build_all_zonelists(void)
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
-	printk("Built %i zonelists.  Total pages: %ld\n",
-			num_online_nodes(), vm_total_pages);
+	printk("Built %i zonelists in %s order.  Total pages: %ld\n",
+			num_online_nodes(),
+			zonelist_order_name[current_zonelist_order],
+			vm_total_pages);
+#ifdef CONFIG_NUMA
+	printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
 }
 
 /*
-- 
cgit v1.3-8-gc7d7


From 698827fa9f45019df1609bb686bc51c94e127fbc Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Sun, 15 Jul 2007 23:38:06 -0700
Subject: Remove the deprecated "kmem_cache_t" typedef from slab.h.

Given that there is no remaining usage of the deprecated kmem_cache_t
typedef anywhere in the tree, remove that typedef.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index cebcd3833c76..cd6ab658553f 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -14,8 +14,6 @@
 #include <linux/gfp.h>
 #include <linux/types.h>
 
-typedef struct kmem_cache kmem_cache_t __deprecated;
-
 /*
  * Flags to pass to kmem_cache_create().
  * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set.
-- 
cgit v1.3-8-gc7d7


From fc9a07e7bf1a76e710f5df017abb07628db1781d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sun, 15 Jul 2007 23:38:14 -0700
Subject: invalidate_mapping_pages(): add cond_resched

invalidate_mapping_pages() can sometimes take a long time (millions of pages
to free).  Long enough for the softlockup detector to trigger.

We used to have a cond_resched() in there but I took it out because the
drop_caches code calls invalidate_mapping_pages() under inode_lock.

The patch adds a nasty flag and puts the cond_resched() back.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/drop_caches.c   |  2 +-
 include/linux/fs.h |  3 +++
 mm/truncate.c      | 38 +++++++++++++++++++++++---------------
 3 files changed, 27 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 03ea7696fe39..59375efcf39d 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -20,7 +20,7 @@ static void drop_pagecache_sb(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_WILL_FREE))
 			continue;
-		invalidate_mapping_pages(inode->i_mapping, 0, -1);
+		__invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
 	}
 	spin_unlock(&inode_lock);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4f0b3bf5983c..51c938a71dec 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1610,6 +1610,9 @@ extern int __invalidate_device(struct block_device *);
 extern int invalidate_partition(struct gendisk *, int);
 #endif
 extern int invalidate_inodes(struct super_block *);
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+					pgoff_t start, pgoff_t end,
+					bool be_atomic);
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
 					pgoff_t start, pgoff_t end);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index af3dcf0e48e6..7c994f2d6145 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 }
 EXPORT_SYMBOL(truncate_inode_pages);
 
-/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
- * @start: the offset 'from' which to invalidate
- * @end: the offset 'to' which to invalidate (inclusive)
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- *
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
- */
-unsigned long invalidate_mapping_pages(struct address_space *mapping,
-				pgoff_t start, pgoff_t end)
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+				pgoff_t start, pgoff_t end, bool be_atomic)
 {
 	struct pagevec pvec;
 	pgoff_t next = start;
@@ -308,9 +295,30 @@ unlock:
 				break;
 		}
 		pagevec_release(&pvec);
+		if (likely(!be_atomic))
+			cond_resched();
 	}
 	return ret;
 }
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
+{
+	return __invalidate_mapping_pages(mapping, start, end, false);
+}
 EXPORT_SYMBOL(invalidate_mapping_pages);
 
 /*
-- 
cgit v1.3-8-gc7d7


From 8f0accc8627043702e6ea2bb8b9aa3a171ef8393 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Sun, 15 Jul 2007 23:38:19 -0700
Subject: kill vmalloc_earlyreserve

This symbol got orphaned quite a while ago.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-i386/pgtable.h | 2 +-
 include/linux/mm.h         | 1 -
 mm/memory.c                | 2 --
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 0efa8063a3e9..01734e05e63b 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -79,7 +79,7 @@ void paging_init(void);
  * area for the same reason. ;)
  */
 #define VMALLOC_OFFSET	(8*1024*1024)
-#define VMALLOC_START	(((unsigned long) high_memory + vmalloc_earlyreserve + \
+#define VMALLOC_START	(((unsigned long) high_memory + \
 			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c1207472bb4..bbd427e8741a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,7 +27,6 @@ extern unsigned long max_mapnr;
 
 extern unsigned long num_physpages;
 extern void * high_memory;
-extern unsigned long vmalloc_earlyreserve;
 extern int page_cluster;
 
 #ifdef CONFIG_SYSCTL
diff --git a/mm/memory.c b/mm/memory.c
index cfddcd2075b9..b3d73bb1f680 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
  * and ZONE_HIGHMEM.
  */
 void * high_memory;
-unsigned long vmalloc_earlyreserve;
 
 EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 int randomize_va_space __read_mostly = 1;
 
-- 
cgit v1.3-8-gc7d7


From 6193a2ff180920f84ee06977165ebf32431fc2d2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 15 Jul 2007 23:38:22 -0700
Subject: slob: initial NUMA support

This adds preliminary NUMA support to SLOB, primarily aimed at systems with
small nodes (tested all the way down to a 128kB SRAM block), whether
asymmetric or otherwise.

We follow the same conventions as SLAB/SLUB, preferring current node
placement for new pages, or with explicit placement, if a node has been
specified.  Presently on UP NUMA this has the side-effect of preferring
node#0 allocations (since numa_node_id() == 0, though this could be
reworked if we could hand off a pfn to determine node placement), so
single-CPU NUMA systems will want to place smaller nodes further out in
terms of node id.  Once a page has been bound to a node (via explicit node
id typing), we only do block allocations from partial free pages that have
a matching node id in the page flags.

The current implementation does have some scalability problems, in that all
partial free pages are tracked in the global freelist (with contention due
to the single spinlock).  However, these are things that are being reworked
for SMP scalability first, while things like per-node freelists can easily
be built on top of this sort of functionality once it's been added.

More background can be found in:

	http://marc.info/?l=linux-mm&m=118117916022379&w=2
	http://marc.info/?l=linux-mm&m=118170446306199&w=2
	http://marc.info/?l=linux-mm&m=118187859420048&w=2

and subsequent threads.

Acked-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h     | 126 +++++++++++++++++++++++------------------------
 include/linux/slab_def.h |   4 ++
 include/linux/slob_def.h |  46 +++++++++++++++++
 include/linux/slub_def.h |   6 ++-
 mm/slob.c                |  72 ++++++++++++++++++++-------
 5 files changed, 172 insertions(+), 82 deletions(-)
 create mode 100644 include/linux/slob_def.h

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index cd6ab658553f..27402fea9b79 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -42,7 +42,6 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			void (*)(void *, struct kmem_cache *, unsigned long));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
 void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 void kmem_cache_free(struct kmem_cache *, void *);
 unsigned int kmem_cache_size(struct kmem_cache *);
@@ -61,16 +60,6 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL, NULL)
 
-#ifdef CONFIG_NUMA
-extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
-#else
-static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
-					gfp_t flags, int node)
-{
-	return kmem_cache_alloc(cachep, flags);
-}
-#endif
-
 /*
  * The largest kmalloc size supported by the slab allocators is
  * 32 megabyte (2^25) or the maximum allocatable page order if that is
@@ -89,7 +78,6 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
 /*
  * Common kmalloc functions provided by all allocators
  */
-void *__kmalloc(size_t, gfp_t);
 void *__kzalloc(size_t, gfp_t);
 void * __must_check krealloc(const void *, size_t, gfp_t);
 void kfree(const void *);
@@ -100,40 +88,6 @@ size_t ksize(const void *);
  * @n: number of elements.
  * @size: element size.
  * @flags: the type of memory to allocate.
- */
-static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
-{
-	if (n != 0 && size > ULONG_MAX / n)
-		return NULL;
-	return __kzalloc(n * size, flags);
-}
-
-/*
- * Allocator specific definitions. These are mainly used to establish optimized
- * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by selecting
- * the appropriate general cache at compile time.
- */
-
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
-#ifdef CONFIG_SLUB
-#include <linux/slub_def.h>
-#else
-#include <linux/slab_def.h>
-#endif /* !CONFIG_SLUB */
-#else
-
-/*
- * Fallback definitions for an allocator not wanting to provide
- * its own optimized kmalloc definitions (like SLOB).
- */
-
-/**
- * kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
  *
  * The @flags argument may be one of:
  *
@@ -141,7 +95,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
  *
  * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
  *
- * %GFP_ATOMIC - Allocation will not sleep.
+ * %GFP_ATOMIC - Allocation will not sleep.  May use emergency pools.
  *   For example, use this inside interrupt handlers.
  *
  * %GFP_HIGHUSER - Allocate pages from high memory.
@@ -150,18 +104,22 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
  *
  * %GFP_NOFS - Do not make any fs calls while trying to get memory.
  *
+ * %GFP_NOWAIT - Allocation will not sleep.
+ *
+ * %GFP_THISNODE - Allocate node-local memory only.
+ *
+ * %GFP_DMA - Allocation suitable for DMA.
+ *   Should only be used for kmalloc() caches. Otherwise, use a
+ *   slab created with SLAB_DMA.
+ *
  * Also it is possible to set different flags by OR'ing
  * in one or more of the following additional @flags:
  *
  * %__GFP_COLD - Request cache-cold pages instead of
  *   trying to return cache-warm pages.
  *
- * %__GFP_DMA - Request memory from the DMA-capable zone.
- *
  * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
  *
- * %__GFP_HIGHMEM - Allocated memory may be from highmem.
- *
  * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
  *   (think twice before using).
  *
@@ -171,24 +129,57 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
  * %__GFP_NOWARN - If allocation fails, don't issue any warnings.
  *
  * %__GFP_REPEAT - If allocation fails initially, try once more before failing.
+ *
+ * There are other flags available as well, but these are not intended
+ * for general use, and so are not documented here. For a full list of
+ * potential flags, always refer to linux/gfp.h.
  */
-static inline void *kmalloc(size_t size, gfp_t flags)
+static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
-	return __kmalloc(size, flags);
+	if (n != 0 && size > ULONG_MAX / n)
+		return NULL;
+	return __kzalloc(n * size, flags);
 }
 
-/**
- * kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
+/*
+ * Allocator specific definitions. These are mainly used to establish optimized
+ * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
+ * selecting the appropriate general cache at compile time.
+ *
+ * Allocators must define at least:
+ *
+ *	kmem_cache_alloc()
+ *	__kmalloc()
+ *	kmalloc()
+ *	kzalloc()
+ *
+ * Those wishing to support NUMA must also define:
+ *
+ *	kmem_cache_alloc_node()
+ *	kmalloc_node()
+ *
+ * See each allocator definition file for additional comments and
+ * implementation notes.
  */
-static inline void *kzalloc(size_t size, gfp_t flags)
-{
-	return __kzalloc(size, flags);
-}
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#elif defined(CONFIG_SLOB)
+#include <linux/slob_def.h>
+#else
+#include <linux/slab_def.h>
 #endif
 
-#ifndef CONFIG_NUMA
+#if !defined(CONFIG_NUMA) && !defined(CONFIG_SLOB)
+/**
+ * kmalloc_node - allocate memory from a specific node
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kcalloc).
+ * @node: node to allocate from.
+ *
+ * kmalloc() for non-local nodes, used to allocate from a specific node
+ * if available. Equivalent to kmalloc() in the non-NUMA single-node
+ * case.
+ */
 static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return kmalloc(size, flags);
@@ -198,7 +189,15 @@ static inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return __kmalloc(size, flags);
 }
-#endif /* !CONFIG_NUMA */
+
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+
+static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
+					gfp_t flags, int node)
+{
+	return kmem_cache_alloc(cachep, flags);
+}
+#endif /* !CONFIG_NUMA && !CONFIG_SLOB */
 
 /*
  * kmalloc_track_caller is a special version of kmalloc that records the
@@ -245,4 +244,3 @@ extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
 
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SLAB_H */
-
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 8d81a60518e4..365d036c454a 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -25,6 +25,9 @@ struct cache_sizes {
 };
 extern struct cache_sizes malloc_sizes[];
 
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+void *__kmalloc(size_t size, gfp_t flags);
+
 static inline void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
@@ -79,6 +82,7 @@ found:
 
 #ifdef CONFIG_NUMA
 extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
+extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 
 static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
new file mode 100644
index 000000000000..a2daf2d418a9
--- /dev/null
+++ b/include/linux/slob_def.h
@@ -0,0 +1,46 @@
+#ifndef __LINUX_SLOB_DEF_H
+#define __LINUX_SLOB_DEF_H
+
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
+
+static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	return kmem_cache_alloc_node(cachep, flags, -1);
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node);
+
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __kmalloc_node(size, flags, node);
+}
+
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * kmalloc is the normal method of allocating memory
+ * in the kernel.
+ */
+static inline void *kmalloc(size_t size, gfp_t flags)
+{
+	return __kmalloc_node(size, flags, -1);
+}
+
+static inline void *__kmalloc(size_t size, gfp_t flags)
+{
+	return kmalloc(size, flags);
+}
+
+/**
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kcalloc).
+ */
+static inline void *kzalloc(size_t size, gfp_t flags)
+{
+	return __kzalloc(size, flags);
+}
+
+#endif /* __LINUX_SLOB_DEF_H */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 6207a3d8da71..a582f6771525 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -171,6 +171,9 @@ static inline struct kmem_cache *kmalloc_slab(size_t size)
 #define ZERO_SIZE_PTR ((void *)16)
 
 
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+void *__kmalloc(size_t size, gfp_t flags);
+
 static inline void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size) && !(flags & SLUB_DMA)) {
@@ -198,7 +201,8 @@ static inline void *kzalloc(size_t size, gfp_t flags)
 }
 
 #ifdef CONFIG_NUMA
-extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
+void *__kmalloc_node(size_t size, gfp_t flags, int node);
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 
 static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
diff --git a/mm/slob.c b/mm/slob.c
index 06e5e725fab3..b99b0ef2347e 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,6 +3,8 @@
  *
  * Matt Mackall <mpm@selenic.com> 12/30/03
  *
+ * NUMA support by Paul Mundt, 2007.
+ *
  * How SLOB works:
  *
  * The core of SLOB is a traditional K&R style heap allocator, with
@@ -10,7 +12,7 @@
  * allocator is as little as 2 bytes, however typically most architectures
  * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
  *
- * The slob heap is a linked list of pages from __get_free_page, and
+ * The slob heap is a linked list of pages from alloc_pages(), and
  * within each page, there is a singly-linked list of free blocks (slob_t).
  * The heap is grown on demand and allocation from the heap is currently
  * first-fit.
@@ -18,7 +20,7 @@
  * Above this is an implementation of kmalloc/kfree. Blocks returned
  * from kmalloc are prepended with a 4-byte header with the kmalloc size.
  * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
- * __get_free_pages directly, allocating compound pages so the page order
+ * alloc_pages() directly, allocating compound pages so the page order
  * does not have to be separately tracked, and also stores the exact
  * allocation size in page->private so that it can be used to accurately
  * provide ksize(). These objects are detected in kfree() because slob_page()
@@ -29,10 +31,23 @@
  * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
  * case the low-level allocator will fragment blocks to create the proper
  * alignment. Again, objects of page-size or greater are allocated by
- * calling __get_free_pages. As SLAB objects know their size, no separate
+ * calling alloc_pages(). As SLAB objects know their size, no separate
  * size bookkeeping is necessary and there is essentially no allocation
  * space overhead, and compound pages aren't needed for multi-page
  * allocations.
+ *
+ * NUMA support in SLOB is fairly simplistic, pushing most of the real
+ * logic down to the page allocator, and simply doing the node accounting
+ * on the upper levels. In the event that a node id is explicitly
+ * provided, alloc_pages_node() with the specified node id is used
+ * instead. The common case (or when the node id isn't explicitly provided)
+ * will default to the current node, as per numa_node_id().
+ *
+ * Node aware pages are still inserted in to the global freelist, and
+ * these are scanned for by matching against the node id encoded in the
+ * page flags. As a result, block allocations that can be satisfied from
+ * the freelist will only be done so on pages residing on the same node,
+ * in order to prevent random node placement.
  */
 
 #include <linux/kernel.h>
@@ -204,6 +219,23 @@ static int slob_last(slob_t *s)
 	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
 }
 
+static void *slob_new_page(gfp_t gfp, int order, int node)
+{
+	void *page;
+
+#ifdef CONFIG_NUMA
+	if (node != -1)
+		page = alloc_pages_node(node, gfp, order);
+	else
+#endif
+		page = alloc_pages(gfp, order);
+
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 /*
  * Allocate a slob block within a given slob_page sp.
  */
@@ -258,7 +290,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
 /*
  * slob_alloc: entry point into the slob allocator.
  */
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 {
 	struct slob_page *sp;
 	slob_t *b = NULL;
@@ -267,6 +299,15 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 	spin_lock_irqsave(&slob_lock, flags);
 	/* Iterate through each partially free page, try to find room */
 	list_for_each_entry(sp, &free_slob_pages, list) {
+#ifdef CONFIG_NUMA
+		/*
+		 * If there's a node specification, search for a partial
+		 * page with a matching node id in the freelist.
+		 */
+		if (node != -1 && page_to_nid(&sp->page) != node)
+			continue;
+#endif
+
 		if (sp->units >= SLOB_UNITS(size)) {
 			b = slob_page_alloc(sp, size, align);
 			if (b)
@@ -277,7 +318,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 
 	/* Not enough space: must allocate a new page */
 	if (!b) {
-		b = (slob_t *)__get_free_page(gfp);
+		b = slob_new_page(gfp, 0, node);
 		if (!b)
 			return 0;
 		sp = (struct slob_page *)virt_to_page(b);
@@ -381,22 +422,20 @@ out:
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
 #endif
 
-
-void *__kmalloc(size_t size, gfp_t gfp)
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 
 	if (size < PAGE_SIZE - align) {
 		unsigned int *m;
-		m = slob_alloc(size + align, gfp, align);
+		m = slob_alloc(size + align, gfp, align, node);
 		if (m)
 			*m = size;
 		return (void *)m + align;
 	} else {
 		void *ret;
 
-		ret = (void *) __get_free_pages(gfp | __GFP_COMP,
-						get_order(size));
+		ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
 		if (ret) {
 			struct page *page;
 			page = virt_to_page(ret);
@@ -405,7 +444,7 @@ void *__kmalloc(size_t size, gfp_t gfp)
 		return ret;
 	}
 }
-EXPORT_SYMBOL(__kmalloc);
+EXPORT_SYMBOL(__kmalloc_node);
 
 /**
  * krealloc - reallocate memory. The contents will remain unchanged.
@@ -455,7 +494,6 @@ void kfree(const void *block)
 	} else
 		put_page(&sp->page);
 }
-
 EXPORT_SYMBOL(kfree);
 
 /* can't use ksize for kmem_cache_alloc memory, only kmalloc */
@@ -487,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 {
 	struct kmem_cache *c;
 
-	c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+	c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
 
 	if (c) {
 		c->name = name;
@@ -517,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
 	if (c->size < PAGE_SIZE)
-		b = slob_alloc(c->size, flags, c->align);
+		b = slob_alloc(c->size, flags, c->align, node);
 	else
-		b = (void *)__get_free_pages(flags, get_order(c->size));
+		b = slob_new_page(flags, get_order(c->size), node);
 
 	if (c->ctor)
 		c->ctor(b, c, 0);
 
 	return b;
 }
-EXPORT_SYMBOL(kmem_cache_alloc);
+EXPORT_SYMBOL(kmem_cache_alloc_node);
 
 void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
 {
-- 
cgit v1.3-8-gc7d7


From 0165ab443556bdfad388da6c33d74a71b77d72b2 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Sun, 15 Jul 2007 23:38:26 -0700
Subject: split mmap

This is a straightforward split of do_mmap_pgoff() into two functions:

 - do_mmap_pgoff() checks the parameters, and calculates the vma
   flags.  Then it calls

 - mmap_region(), which does the actual mapping

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  4 ++++
 mm/mmap.c          | 25 +++++++++++++++++++------
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bbd427e8741a..d4ab4e590e23 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1071,6 +1071,10 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long pgoff);
+extern unsigned long mmap_region(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long flags,
+	unsigned int vm_flags, unsigned long pgoff,
+	int accountable);
 
 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
diff --git a/mm/mmap.c b/mm/mmap.c
index 9f70c8e8c871..144b4a290f2c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			unsigned long flags, unsigned long pgoff)
 {
 	struct mm_struct * mm = current->mm;
-	struct vm_area_struct * vma, * prev;
 	struct inode *inode;
 	unsigned int vm_flags;
-	int correct_wcount = 0;
 	int error;
-	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
-	unsigned long charged = 0, reqprot = prot;
+	unsigned long reqprot = prot;
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1027,6 +1024,24 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	if (error)
 		return error;
 
+	return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+			   accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+			  unsigned long len, unsigned long flags,
+			  unsigned int vm_flags, unsigned long pgoff,
+			  int accountable)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	int correct_wcount = 0;
+	int error;
+	struct rb_node **rb_link, *rb_parent;
+	unsigned long charged = 0;
+	struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
+
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
@@ -1175,8 +1190,6 @@ unacct_error:
 	return error;
 }
 
-EXPORT_SYMBOL(do_mmap_pgoff);
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
-- 
cgit v1.3-8-gc7d7


From 786d7e1612f0b0adb6046f19b906609e4fe8b1ba Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@sw.ru>
Date: Sun, 15 Jul 2007 23:39:00 -0700
Subject: Fix rmmod/read/write races in /proc entries

Fix following races:
===========================================
1. Write via ->write_proc sleeps in copy_from_user(). Module disappears
   meanwhile. Or, more generically, system call done on /proc file, method
   supplied by module is called, module dissapeares meanwhile.

   pde = create_proc_entry()
   if (!pde)
	return -ENOMEM;
   pde->write_proc = ...
				open
				write
				copy_from_user
   pde = create_proc_entry();
   if (!pde) {
	remove_proc_entry();
	return -ENOMEM;
	/* module unloaded */
   }
				*boom*
==========================================
2. bogo-revoke aka proc_kill_inodes()

  remove_proc_entry		vfs_read
  proc_kill_inodes		[check ->f_op validness]
				[check ->f_op->read validness]
				[verify_area, security permissions checks]
	->f_op = NULL;
				if (file->f_op->read)
					/* ->f_op dereference, boom */

NOTE, NOTE, NOTE: file_operations are proxied for regular files only. Let's
see how this scheme behaves, then extend if needed for directories.
Directories creators in /proc only set ->owner for them, so proxying for
directories may be unneeded.

NOTE, NOTE, NOTE: methods being proxied are ->llseek, ->read, ->write,
->poll, ->unlocked_ioctl, ->ioctl, ->compat_ioctl, ->open, ->release.
If your in-tree module uses something else, yell on me. Full audit pending.

[akpm@linux-foundation.org: build fix]
Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c       |  32 +++++-
 fs/proc/inode.c         | 254 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/proc_fs.h |  13 +++
 3 files changed, 296 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8a40e15f5ecb..4f8e53568b22 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -20,6 +20,7 @@
 #include <linux/namei.h>
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
+#include <linux/completion.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -613,6 +614,9 @@ static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
 	ent->namelen = len;
 	ent->mode = mode;
 	ent->nlink = nlink;
+	ent->pde_users = 0;
+	spin_lock_init(&ent->pde_unload_lock);
+	ent->pde_unload_completion = NULL;
  out:
 	return ent;
 }
@@ -734,9 +738,35 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 		de = *p;
 		*p = de->next;
 		de->next = NULL;
+
+		spin_lock(&de->pde_unload_lock);
+		/*
+		 * Stop accepting new callers into module. If you're
+		 * dynamically allocating ->proc_fops, save a pointer somewhere.
+		 */
+		de->proc_fops = NULL;
+		/* Wait until all existing callers into module are done. */
+		if (de->pde_users > 0) {
+			DECLARE_COMPLETION_ONSTACK(c);
+
+			if (!de->pde_unload_completion)
+				de->pde_unload_completion = &c;
+
+			spin_unlock(&de->pde_unload_lock);
+			spin_unlock(&proc_subdir_lock);
+
+			wait_for_completion(de->pde_unload_completion);
+
+			spin_lock(&proc_subdir_lock);
+			goto continue_removing;
+		}
+		spin_unlock(&de->pde_unload_lock);
+
+continue_removing:
 		if (S_ISDIR(de->mode))
 			parent->nlink--;
-		proc_kill_inodes(de);
+		if (!S_ISREG(de->mode))
+			proc_kill_inodes(de);
 		de->nlink = 0;
 		WARN_ON(de->subdir);
 		if (!atomic_read(&de->count))
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d5ce65c68d7b..dd28e86ab422 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
+#include <linux/completion.h>
 #include <linux/file.h>
 #include <linux/limits.h>
 #include <linux/init.h>
@@ -140,6 +141,251 @@ static const struct super_operations proc_sops = {
 	.remount_fs	= proc_remount,
 };
 
+static void pde_users_dec(struct proc_dir_entry *pde)
+{
+	spin_lock(&pde->pde_unload_lock);
+	pde->pde_users--;
+	if (pde->pde_unload_completion && pde->pde_users == 0)
+		complete(pde->pde_unload_completion);
+	spin_unlock(&pde->pde_unload_lock);
+}
+
+static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	loff_t rv = -EINVAL;
+	loff_t (*llseek)(struct file *, loff_t, int);
+
+	spin_lock(&pde->pde_unload_lock);
+	/*
+	 * remove_proc_entry() is going to delete PDE (as part of module
+	 * cleanup sequence). No new callers into module allowed.
+	 */
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	/*
+	 * Bump refcount so that remove_proc_entry will wail for ->llseek to
+	 * complete.
+	 */
+	pde->pde_users++;
+	/*
+	 * Save function pointer under lock, to protect against ->proc_fops
+	 * NULL'ifying right after ->pde_unload_lock is dropped.
+	 */
+	llseek = pde->proc_fops->llseek;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (!llseek)
+		llseek = default_llseek;
+	rv = llseek(file, offset, whence);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	ssize_t rv = -EIO;
+	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	read = pde->proc_fops->read;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (read)
+		rv = read(file, buf, count, ppos);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	ssize_t rv = -EIO;
+	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	write = pde->proc_fops->write;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (write)
+		rv = write(file, buf, count, ppos);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	unsigned int rv = 0;
+	unsigned int (*poll)(struct file *, struct poll_table_struct *);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	poll = pde->proc_fops->poll;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (poll)
+		rv = poll(file, pts);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	long rv = -ENOTTY;
+	long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
+	int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
+	ioctl = pde->proc_fops->ioctl;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (unlocked_ioctl) {
+		rv = unlocked_ioctl(file, cmd, arg);
+		if (rv == -ENOIOCTLCMD)
+			rv = -EINVAL;
+	} else if (ioctl) {
+		lock_kernel();
+		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
+		unlock_kernel();
+	}
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+#ifdef CONFIG_COMPAT
+static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	long rv = -ENOTTY;
+	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	compat_ioctl = pde->proc_fops->compat_ioctl;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (compat_ioctl)
+		rv = compat_ioctl(file, cmd, arg);
+
+	pde_users_dec(pde);
+	return rv;
+}
+#endif
+
+static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	int rv = -EIO;
+	int (*mmap)(struct file *, struct vm_area_struct *);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	mmap = pde->proc_fops->mmap;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (mmap)
+		rv = mmap(file, vma);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static int proc_reg_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	int rv = 0;
+	int (*open)(struct inode *, struct file *);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	open = pde->proc_fops->open;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (open)
+		rv = open(inode, file);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static int proc_reg_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	int rv = 0;
+	int (*release)(struct inode *, struct file *);
+
+	spin_lock(&pde->pde_unload_lock);
+	if (!pde->proc_fops) {
+		spin_unlock(&pde->pde_unload_lock);
+		return rv;
+	}
+	pde->pde_users++;
+	release = pde->proc_fops->release;
+	spin_unlock(&pde->pde_unload_lock);
+
+	if (release)
+		rv = release(inode, file);
+
+	pde_users_dec(pde);
+	return rv;
+}
+
+static const struct file_operations proc_reg_file_ops = {
+	.llseek		= proc_reg_llseek,
+	.read		= proc_reg_read,
+	.write		= proc_reg_write,
+	.poll		= proc_reg_poll,
+	.unlocked_ioctl	= proc_reg_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= proc_reg_compat_ioctl,
+#endif
+	.mmap		= proc_reg_mmap,
+	.open		= proc_reg_open,
+	.release	= proc_reg_release,
+};
+
 struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 				struct proc_dir_entry *de)
 {
@@ -166,8 +412,12 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 			inode->i_nlink = de->nlink;
 		if (de->proc_iops)
 			inode->i_op = de->proc_iops;
-		if (de->proc_fops)
-			inode->i_fop = de->proc_fops;
+		if (de->proc_fops) {
+			if (S_ISREG(inode->i_mode))
+				inode->i_fop = &proc_reg_file_ops;
+			else
+				inode->i_fop = de->proc_fops;
+		}
 	}
 
 	return inode;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 3469f96bc8b2..28e3664fdf1b 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -7,6 +7,8 @@
 #include <linux/magic.h>
 #include <asm/atomic.h>
 
+struct completion;
+
 /*
  * The proc filesystem constants/structures
  */
@@ -56,6 +58,14 @@ struct proc_dir_entry {
 	gid_t gid;
 	loff_t size;
 	const struct inode_operations *proc_iops;
+	/*
+	 * NULL ->proc_fops means "PDE is going away RSN" or
+	 * "PDE is just created". In either case, e.g. ->read_proc won't be
+	 * called because it's too late or too early, respectively.
+	 *
+	 * If you're allocating ->proc_fops dynamically, save a pointer
+	 * somewhere.
+	 */
 	const struct file_operations *proc_fops;
 	get_info_t *get_info;
 	struct module *owner;
@@ -66,6 +76,9 @@ struct proc_dir_entry {
 	atomic_t count;		/* use count */
 	int deleted;		/* delete flag */
 	void *set;
+	int pde_users;	/* number of callers into module in progress */
+	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	struct completion *pde_unload_completion;
 };
 
 struct kcore_list {
-- 
cgit v1.3-8-gc7d7


From e1f4a88c5a15a86124a95ea712213bb7dab2ad99 Mon Sep 17 00:00:00 2001
From: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Date: Sun, 15 Jul 2007 23:39:24 -0700
Subject: introduce write_trylock_irqsave()

Introduce a write_trylock_irqsave() implementation.  Similar in style to
the implementation of spin_trylock_irqsave() in mainline.

Signed-off-by: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Cc: Sripathi Kodi <sripathik@in.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/spinlock.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index a946176db638..c376f3b36c89 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -282,6 +282,13 @@ do {						\
 	1 : ({ local_irq_restore(flags); 0; }); \
 })
 
+#define write_trylock_irqsave(lock, flags) \
+({ \
+	local_irq_save(flags); \
+	write_trylock(lock) ? \
+	1 : ({ local_irq_restore(flags); 0; }); \
+})
+
 /*
  * Locks two spinlocks l1 and l2.
  * l1_first indicates if spinlock l1 should be taken first.
-- 
cgit v1.3-8-gc7d7


From ed4aaadb1a7913f509f05d3e67840541a180713f Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Sun, 15 Jul 2007 23:39:39 -0700
Subject: fix jvc cdrom drive lockup

Before calling init_hwif_default, ide_unregister gets lock ide_lock and
disables irq.  init_hwif_default calls ide_default_io_base which calls
pci_get_device and later pci_get_subsys tries to apply for semaphore
pci_bus_sem and goes to sleep.

Mostly, pci_get_device should be called when irq is turned on.

ide_default_io_base just needs find if list pci_devices is empty.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: Greg KH <greg@kroah.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pci/probe.c    | 12 ++++++++++++
 include/asm-i386/ide.h |  4 +---
 include/linux/pci.h    |  3 +++
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a7bce75c6732..34b8dae0d90f 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -22,6 +22,18 @@ EXPORT_SYMBOL(pci_root_buses);
 
 LIST_HEAD(pci_devices);
 
+/*
+ * Some device drivers need know if pci is initiated.
+ * Basically, we think pci is not initiated when there
+ * is no device in list of pci_devices.
+ */
+int no_pci_devices(void)
+{
+	return list_empty(&pci_devices);
+}
+
+EXPORT_SYMBOL(no_pci_devices);
+
 #ifdef HAVE_PCI_LEGACY
 /**
  * pci_create_legacy_files - create legacy I/O port and memory files
diff --git a/include/asm-i386/ide.h b/include/asm-i386/ide.h
index 0fc240c80f49..e7817a3d6578 100644
--- a/include/asm-i386/ide.h
+++ b/include/asm-i386/ide.h
@@ -40,14 +40,13 @@ static __inline__ int ide_default_irq(unsigned long base)
 
 static __inline__ unsigned long ide_default_io_base(int index)
 {
-	struct pci_dev *pdev;
 	/*
 	 *	If PCI is present then it is not safe to poke around
 	 *	the other legacy IDE ports. Only 0x1f0 and 0x170 are
 	 *	defined compatibility mode ports for PCI. A user can 
 	 *	override this using ide= but we must default safe.
 	 */
-	if ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL)) == NULL) {
+	if (no_pci_devices()) {
 		switch(index) {
 			case 2: return 0x1e8;
 			case 3: return 0x168;
@@ -55,7 +54,6 @@ static __inline__ unsigned long ide_default_io_base(int index)
 			case 5: return 0x160;
 		}
 	}
-	pci_dev_put(pdev);
 	switch (index) {
 		case 0:	return 0x1f0;
 		case 1:	return 0x170;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 37a71580ad8a..5e84f2e8d54c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -432,6 +432,8 @@ extern struct bus_type pci_bus_type;
  * code, or pci core code. */
 extern struct list_head pci_root_buses;	/* list of all known PCI buses */
 extern struct list_head pci_devices;	/* list of all devices */
+/* Some device drivers need know if pci is initiated */
+extern int no_pci_devices(void);
 
 void pcibios_fixup_bus(struct pci_bus *);
 int __must_check pcibios_enable_device(struct pci_dev *, int mask);
@@ -724,6 +726,7 @@ static inline struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *
 { return NULL; }
 
 #define pci_dev_present(ids)	(0)
+#define no_pci_devices()	(1)
 #define pci_find_present(ids)	(NULL)
 #define pci_dev_put(dev)	do { } while (0)
 
-- 
cgit v1.3-8-gc7d7


From 7c3f1a573237b90ef331267260358a0ec4ac9079 Mon Sep 17 00:00:00 2001
From: Tomas Janousek <tjanouse@redhat.com>
Date: Sun, 15 Jul 2007 23:39:41 -0700
Subject: Introduce boot based time

The commits

  411187fb05cd11676b0979d9fbf3291db69dbce2 (GTOD: persistent clock support)
  c1d370e167d66b10bca3b602d3740405469383de (i386: use GTOD persistent clock
    support)

changed the monotonic time so that it no longer jumps after resume, but it's
not possible to use it for boot time and process start time calculations then.
 Also, the uptime no longer increases during suspend.

I add a variable to track the wall_to_monotonic changes, a function to get the
real boot time and a function to get the boot based time from the monotonic
one.

[akpm@linux-foundation.org: remove exports, add comment]
Signed-off-by: Tomas Janousek <tjanouse@redhat.com>
Cc: Tomas Smetana <tsmetana@redhat.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time.h      |  2 ++
 kernel/time/timekeeping.c | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index dda9be685ab6..4bb05a829be9 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -116,6 +116,8 @@ extern int do_setitimer(int which, struct itimerval *value,
 extern unsigned int alarm_setitimer(unsigned int seconds);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
+extern void getboottime(struct timespec *ts);
+extern void monotonic_to_bootbased(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_is_continuous(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3d1042f82a68..728cedfd3cbd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -36,9 +36,17 @@ EXPORT_SYMBOL(xtime_lock);
  * at zero at system boot time, so wall_to_monotonic will be negative,
  * however, we will ALWAYS keep the tv_nsec part positive so we can use
  * the usual normalization.
+ *
+ * wall_to_monotonic is moved after resume from suspend for the monotonic
+ * time not to jump. We need to add total_sleep_time to wall_to_monotonic
+ * to get the real boot based time offset.
+ *
+ * - wall_to_monotonic is no longer the boot time, getboottime must be
+ * used instead.
  */
 struct timespec xtime __attribute__ ((aligned (16)));
 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+static unsigned long total_sleep_time;		/* seconds */
 
 EXPORT_SYMBOL(xtime);
 
@@ -251,6 +259,7 @@ void __init timekeeping_init(void)
 	xtime.tv_nsec = 0;
 	set_normalized_timespec(&wall_to_monotonic,
 		-xtime.tv_sec, -xtime.tv_nsec);
+	total_sleep_time = 0;
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
@@ -282,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev)
 
 		xtime.tv_sec += sleep_length;
 		wall_to_monotonic.tv_sec -= sleep_length;
+		total_sleep_time += sleep_length;
 	}
 	/* re-base the last cycle value */
 	clock->cycle_last = clocksource_read(clock);
@@ -476,3 +486,30 @@ void update_wall_time(void)
 	change_clocksource();
 	update_vsyscall(&xtime, clock);
 }
+
+/**
+ * getboottime - Return the real time of system boot.
+ * @ts:		pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ *
+ * This is based on the wall_to_monotonic offset and the total suspend
+ * time. Calls to settimeofday will affect the value returned (which
+ * basically means that however wrong your real time clock is at boot time,
+ * you get the right time here).
+ */
+void getboottime(struct timespec *ts)
+{
+	set_normalized_timespec(ts,
+		- (wall_to_monotonic.tv_sec + total_sleep_time),
+		- wall_to_monotonic.tv_nsec);
+}
+
+/**
+ * monotonic_to_bootbased - Convert the monotonic time to boot based.
+ * @ts:		pointer to the timespec to be converted
+ */
+void monotonic_to_bootbased(struct timespec *ts)
+{
+	ts->tv_sec += total_sleep_time;
+}
-- 
cgit v1.3-8-gc7d7


From 924b42d5a2dbe508407a0a6290d3751f826bccdd Mon Sep 17 00:00:00 2001
From: Tomas Janousek <tjanouse@redhat.com>
Date: Sun, 15 Jul 2007 23:39:42 -0700
Subject: Use boot based time for process start time and boot time in /proc

Commit 411187fb05cd11676b0979d9fbf3291db69dbce2 caused boot time to move and
process start times to become invalid after suspend.  Using boot based time
for those restores the old behaviour and fixes the issue.

[akpm@linux-foundation.org: little cleanup]
Signed-off-by: Tomas Janousek <tjanouse@redhat.com>
Cc: Tomas Smetana <tsmetana@redhat.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c       | 5 +++--
 fs/proc/proc_misc.c   | 6 +++---
 include/linux/sched.h | 3 ++-
 kernel/fork.c         | 2 ++
 4 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 98e78e2f18d6..680c913575f0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -440,8 +440,9 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 
 	/* Temporary variable needed for gcc-2.96 */
 	/* convert timespec -> nsec*/
-	start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC
-				+ task->start_time.tv_nsec;
+	start_time =
+		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
+				+ task->real_start_time.tv_nsec;
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5fd49e47f83a..19c9cbf1c320 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -443,12 +443,12 @@ static int show_stat(struct seq_file *p, void *v)
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
 	u64 sum = 0;
+	struct timespec boottime;
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	jif = - wall_to_monotonic.tv_sec;
-	if (wall_to_monotonic.tv_nsec)
-		--jif;
+	getboottime(&boottime);
+	jif = boottime.tv_sec;
 
 	for_each_possible_cpu(i) {
 		int j;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cfb680585ab8..3cffc1204663 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -972,7 +972,8 @@ struct task_struct {
 	unsigned int rt_priority;
 	cputime_t utime, stime;
 	unsigned long nvcsw, nivcsw; /* context switch counts */
-	struct timespec start_time;
+	struct timespec start_time; 		/* monotonic time */
+	struct timespec real_start_time;	/* boot based time */
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index da3a155bba0d..344d693fdc78 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1059,6 +1059,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->lock_depth = -1;		/* -1 = no lock */
 	do_posix_clock_monotonic_gettime(&p->start_time);
+	p->real_start_time = p->start_time;
+	monotonic_to_bootbased(&p->real_start_time);
 	p->security = NULL;
 	p->io_context = NULL;
 	p->io_wait = NULL;
-- 
cgit v1.3-8-gc7d7


From 9c1729db3e6d738f872bcb090212af00473bf666 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Sun, 15 Jul 2007 23:39:43 -0700
Subject: Prevent an O_NDELAY writer from blocking when a tty write is blocked
 by the tty atomic writer mutex

Without this a tty write could block if a previous blocking tty write was
in progress on the same tty and blocked by a line discipline or hardware
event.  Originally found and reported by Dave Johnson.

Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: Dave Johnson <djohnson+linux-kernel@sw.starentnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/n_hdlc.c    |  9 +++++----
 drivers/char/n_tty.c     |  3 ++-
 drivers/char/tty_io.c    | 40 ++++++++++++++++++++++++++++------------
 drivers/char/tty_ioctl.c |  4 ++--
 include/linux/tty.h      |  6 ++++++
 5 files changed, 43 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c
index 337a87f86a3b..37f7d3403040 100644
--- a/drivers/char/n_hdlc.c
+++ b/drivers/char/n_hdlc.c
@@ -780,13 +780,14 @@ static unsigned int n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp,
 		poll_wait(filp, &tty->write_wait, wait);
 
 		/* set bits for operations that won't block */
-		if(n_hdlc->rx_buf_list.head)
+		if (n_hdlc->rx_buf_list.head)
 			mask |= POLLIN | POLLRDNORM;	/* readable */
 		if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
 			mask |= POLLHUP;
-		if(tty_hung_up_p(filp))
+		if (tty_hung_up_p(filp))
 			mask |= POLLHUP;
-		if(n_hdlc->tx_free_buf_list.head)
+		if (!tty_is_writelocked(tty) &&
+				n_hdlc->tx_free_buf_list.head)
 			mask |= POLLOUT | POLLWRNORM;	/* writable */
 	}
 	return mask;
@@ -861,7 +862,7 @@ static void n_hdlc_buf_put(struct n_hdlc_buf_list *list,
 	spin_lock_irqsave(&list->spinlock,flags);
 	
 	buf->link=NULL;
-	if(list->tail)
+	if (list->tail)
 		list->tail->link = buf;
 	else
 		list->head = buf;
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index 154f42203b05..371631f4bfb9 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -1538,7 +1538,8 @@ static unsigned int normal_poll(struct tty_struct * tty, struct file * file, pol
 		else
 			tty->minimum_to_wake = 1;
 	}
-	if (tty->driver->chars_in_buffer(tty) < WAKEUP_CHARS &&
+	if (!tty_is_writelocked(tty) &&
+			tty->driver->chars_in_buffer(tty) < WAKEUP_CHARS &&
 			tty->driver->write_room(tty) > 0)
 		mask |= POLLOUT | POLLWRNORM;
 	return mask;
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index a96f26a63fa2..a37e6330db8a 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1726,6 +1726,23 @@ static ssize_t tty_read(struct file * file, char __user * buf, size_t count,
 	return i;
 }
 
+void tty_write_unlock(struct tty_struct *tty)
+{
+	mutex_unlock(&tty->atomic_write_lock);
+	wake_up_interruptible(&tty->write_wait);
+}
+
+int tty_write_lock(struct tty_struct *tty, int ndelay)
+{
+	if (!mutex_trylock(&tty->atomic_write_lock)) {
+		if (ndelay)
+			return -EAGAIN;
+		if (mutex_lock_interruptible(&tty->atomic_write_lock))
+			return -ERESTARTSYS;
+	}
+	return 0;
+}
+
 /*
  * Split writes up in sane blocksizes to avoid
  * denial-of-service type attacks
@@ -1737,13 +1754,12 @@ static inline ssize_t do_tty_write(
 	const char __user *buf,
 	size_t count)
 {
-	ssize_t ret = 0, written = 0;
+	ssize_t ret, written = 0;
 	unsigned int chunk;
 	
-	/* FIXME: O_NDELAY ... */
-	if (mutex_lock_interruptible(&tty->atomic_write_lock)) {
-		return -ERESTARTSYS;
-	}
+	ret = tty_write_lock(tty, file->f_flags & O_NDELAY);
+	if (ret < 0)
+		return ret;
 
 	/*
 	 * We chunk up writes into a temporary buffer. This
@@ -1776,8 +1792,8 @@ static inline ssize_t do_tty_write(
 
 		buf = kmalloc(chunk, GFP_KERNEL);
 		if (!buf) {
-			mutex_unlock(&tty->atomic_write_lock);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out;
 		}
 		kfree(tty->write_buf);
 		tty->write_cnt = chunk;
@@ -1812,7 +1828,8 @@ static inline ssize_t do_tty_write(
 		inode->i_mtime = current_fs_time(inode->i_sb);
 		ret = written;
 	}
-	mutex_unlock(&tty->atomic_write_lock);
+out:
+	tty_write_unlock(tty);
 	return ret;
 }
 
@@ -3163,14 +3180,13 @@ static int tiocsetd(struct tty_struct *tty, int __user *p)
 
 static int send_break(struct tty_struct *tty, unsigned int duration)
 {
-	if (mutex_lock_interruptible(&tty->atomic_write_lock))
+	if (tty_write_lock(tty, 0) < 0)
 		return -EINTR;
 	tty->driver->break_ctl(tty, -1);
-	if (!signal_pending(current)) {
+	if (!signal_pending(current))
 		msleep_interruptible(duration);
-	}
 	tty->driver->break_ctl(tty, 0);
-	mutex_unlock(&tty->atomic_write_lock);
+	tty_write_unlock(tty);
 	if (signal_pending(current))
 		return -EINTR;
 	return 0;
diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index fd471cb3338f..918e24c885f1 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -667,7 +667,7 @@ static int send_prio_char(struct tty_struct *tty, char ch)
 		return 0;
 	}
 
-	if (mutex_lock_interruptible(&tty->atomic_write_lock))
+	if (tty_write_lock(tty, 0) < 0)
 		return -ERESTARTSYS;
 
 	if (was_stopped)
@@ -675,7 +675,7 @@ static int send_prio_char(struct tty_struct *tty, char ch)
 	tty->driver->write(tty, &ch, 1);
 	if (was_stopped)
 		stop_tty(tty);
-	mutex_unlock(&tty->atomic_write_lock);
+	tty_write_unlock(tty);
 	return 0;
 }
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index bb4576085203..deaba9ec5004 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -338,6 +338,12 @@ extern struct tty_struct *get_current_tty(void);
 
 extern struct mutex tty_mutex;
 
+extern void tty_write_unlock(struct tty_struct *tty);
+extern int tty_write_lock(struct tty_struct *tty, int ndelay);
+#define tty_is_writelocked(tty)  (mutex_is_locked(&tty->atomic_write_lock))
+
+
+
 /* n_tty.c */
 extern struct tty_ldisc tty_ldisc_N_TTY;
 
-- 
cgit v1.3-8-gc7d7


From 9ac162521cd9796f44d263a61090634844c719a6 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <matthias.kaehlcke@gmail.com>
Date: Sun, 15 Jul 2007 23:39:49 -0700
Subject: Use mutexes instead of semaphores in I2O driver

The I2O driver uses two semaphores as mutexes.  Use the mutex API instead of
the (binary) semaphores.

Signed-off-by: Matthias Kaehlcke <matthias.kaehlcke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/message/i2o/device.c   | 16 ++++++++--------
 drivers/message/i2o/exec-osm.c |  4 ++--
 drivers/message/i2o/iop.c      |  2 +-
 include/linux/i2o.h            |  5 +++--
 4 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 611adc3c0f74..489d7c5c4965 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -62,7 +62,7 @@ int i2o_device_claim(struct i2o_device *dev)
 {
 	int rc = 0;
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 
 	rc = i2o_device_issue_claim(dev, I2O_CMD_UTIL_CLAIM, I2O_CLAIM_PRIMARY);
 	if (!rc)
@@ -72,7 +72,7 @@ int i2o_device_claim(struct i2o_device *dev)
 		pr_debug("i2o: claim of device %d failed %d\n",
 			 dev->lct_data.tid, rc);
 
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 
 	return rc;
 }
@@ -96,7 +96,7 @@ int i2o_device_claim_release(struct i2o_device *dev)
 	int tries;
 	int rc = 0;
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 
 	/*
 	 *      If the controller takes a nonblocking approach to
@@ -118,7 +118,7 @@ int i2o_device_claim_release(struct i2o_device *dev)
 		pr_debug("i2o: claim release of device %d failed %d\n",
 			 dev->lct_data.tid, rc);
 
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 
 	return rc;
 }
@@ -198,7 +198,7 @@ static struct i2o_device *i2o_device_alloc(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&dev->list);
-	init_MUTEX(&dev->lock);
+	mutex_init(&dev->lock);
 
 	dev->device.bus = &i2o_bus_type;
 	dev->device.release = &i2o_device_release;
@@ -326,7 +326,7 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 	u16 table_size;
 	u32 buf;
 
-	down(&c->lct_lock);
+	mutex_lock(&c->lct_lock);
 
 	kfree(c->lct);
 
@@ -335,7 +335,7 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 
 	lct = c->lct = kmalloc(table_size * 4, GFP_KERNEL);
 	if (!lct) {
-		up(&c->lct_lock);
+		mutex_unlock(&c->lct_lock);
 		return -ENOMEM;
 	}
 
@@ -408,7 +408,7 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 			i2o_device_remove(dev);
 	}
 
-	up(&c->lct_lock);
+	mutex_unlock(&c->lct_lock);
 
 	return 0;
 }
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 5278aad92bc4..c13b9321e7ab 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -537,7 +537,7 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
 	struct device *dev;
 	struct i2o_message *msg;
 
-	down(&c->lct_lock);
+	mutex_lock(&c->lct_lock);
 
 	dev = &c->pdev->dev;
 
@@ -561,7 +561,7 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
 
 	i2o_msg_post(c, msg);
 
-	up(&c->lct_lock);
+	mutex_unlock(&c->lct_lock);
 
 	return 0;
 };
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 3305c12372a2..a1ec16a075c6 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -1067,7 +1067,7 @@ struct i2o_controller *i2o_iop_alloc(void)
 
 	INIT_LIST_HEAD(&c->devices);
 	spin_lock_init(&c->lock);
-	init_MUTEX(&c->lct_lock);
+	mutex_init(&c->lct_lock);
 
 	device_initialize(&c->device);
 
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 52f53e2e70c3..333a370a3bdc 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -31,6 +31,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>	/* work_struct */
 #include <linux/mempool.h>
+#include <linux/mutex.h>
 
 #include <asm/io.h>
 #include <asm/semaphore.h>	/* Needed for MUTEX init macros */
@@ -425,7 +426,7 @@ struct i2o_device {
 
 	struct device device;
 
-	struct semaphore lock;	/* device lock */
+	struct mutex lock;	/* device lock */
 };
 
 /*
@@ -544,7 +545,7 @@ struct i2o_controller {
 	struct i2o_dma hrt;	/* HW Resource Table */
 	i2o_lct *lct;		/* Logical Config Table */
 	struct i2o_dma dlct;	/* Temp LCT */
-	struct semaphore lct_lock;	/* Lock for LCT updates */
+	struct mutex lct_lock;	/* Lock for LCT updates */
 	struct i2o_dma status_block;	/* IOP status block */
 
 	struct i2o_io base;	/* controller messaging unit */
-- 
cgit v1.3-8-gc7d7


From 21f3da95daed2d0f0c28cc4ef8b1103fbfb7bded Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sun, 15 Jul 2007 23:39:50 -0700
Subject: fuse warning fix

gcc-4.3:

fs/fuse/dir.c: In function 'parse_dirfile':
fs/fuse/dir.c:833: warning: cast from pointer to integer of different size
fs/fuse/dir.c:835: warning: cast from pointer to integer of different size

[miklos@szeredi.hu: use offsetof]
Acked-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fuse.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 534744efe30d..9fbe9d258e22 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -339,7 +339,7 @@ struct fuse_dirent {
 	char name[0];
 };
 
-#define FUSE_NAME_OFFSET ((unsigned) ((struct fuse_dirent *) 0)->name)
+#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
 #define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
 #define FUSE_DIRENT_SIZE(d) \
 	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
-- 
cgit v1.3-8-gc7d7


From c67ad917cbf21b2862e2cf8e8b28339872ef7927 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sun, 15 Jul 2007 23:39:51 -0700
Subject: percpu_counters(): use cpu notifiers

per-cpu counters presently must iterate over all possible CPUs in the
exhaustive percpu_counter_sum().

But it can be much better to only iterate over the presently-online CPUs.  To
do this, we must arrange for an offlined CPU's count to be spilled into the
counter's central count.

We can do this for all percpu_counters in the machine by linking them into a
single global list and walking that list at CPU_DEAD time.

(I hope.  Might have race windows in which the percpu_counter_sum() count is
inaccurate?)

Cc: Gautham R Shenoy <ego@in.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu_counter.h | 18 ++++--------
 lib/percpu_counter.c           | 66 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index f5aa593ccf32..3d9f70972cdf 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -8,6 +8,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/smp.h>
+#include <linux/list.h>
 #include <linux/threads.h>
 #include <linux/percpu.h>
 #include <linux/types.h>
@@ -17,6 +18,9 @@
 struct percpu_counter {
 	spinlock_t lock;
 	s64 count;
+#ifdef CONFIG_HOTPLUG_CPU
+	struct list_head list;	/* All percpu_counters are on a list */
+#endif
 	s32 *counters;
 };
 
@@ -26,18 +30,8 @@ struct percpu_counter {
 #define FBC_BATCH	(NR_CPUS*4)
 #endif
 
-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
-{
-	spin_lock_init(&fbc->lock);
-	fbc->count = amount;
-	fbc->counters = alloc_percpu(s32);
-}
-
-static inline void percpu_counter_destroy(struct percpu_counter *fbc)
-{
-	free_percpu(fbc->counters);
-}
-
+void percpu_counter_init(struct percpu_counter *fbc, s64 amount);
+void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
 s64 percpu_counter_sum(struct percpu_counter *fbc);
 
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 850449080e1c..8901c4e9c2e6 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -3,8 +3,17 @@
  */
 
 #include <linux/percpu_counter.h>
+#include <linux/notifier.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
 #include <linux/module.h>
 
+#ifdef CONFIG_HOTPLUG_CPU
+static LIST_HEAD(percpu_counters);
+static DEFINE_MUTEX(percpu_counters_lock);
+#endif
+
 void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
 {
 	long count;
@@ -44,3 +53,60 @@ s64 percpu_counter_sum(struct percpu_counter *fbc)
 	return ret < 0 ? 0 : ret;
 }
 EXPORT_SYMBOL(percpu_counter_sum);
+
+void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+{
+	spin_lock_init(&fbc->lock);
+	fbc->count = amount;
+	fbc->counters = alloc_percpu(s32);
+#ifdef CONFIG_HOTPLUG_CPU
+	mutex_lock(&percpu_counters_lock);
+	list_add(&fbc->list, &percpu_counters);
+	mutex_unlock(&percpu_counters_lock);
+#endif
+}
+EXPORT_SYMBOL(percpu_counter_init);
+
+void percpu_counter_destroy(struct percpu_counter *fbc)
+{
+	free_percpu(fbc->counters);
+#ifdef CONFIG_HOTPLUG_CPU
+	mutex_lock(&percpu_counters_lock);
+	list_del(&fbc->list);
+	mutex_unlock(&percpu_counters_lock);
+#endif
+}
+EXPORT_SYMBOL(percpu_counter_destroy);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
+					unsigned long action, void *hcpu)
+{
+	unsigned int cpu;
+	struct percpu_counter *fbc;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+
+	cpu = (unsigned long)hcpu;
+	mutex_lock(&percpu_counters_lock);
+	list_for_each_entry(fbc, &percpu_counters, list) {
+		s32 *pcount;
+
+		spin_lock(&fbc->lock);
+		pcount = per_cpu_ptr(fbc->counters, cpu);
+		fbc->count += *pcount;
+		*pcount = 0;
+		spin_unlock(&fbc->lock);
+	}
+	mutex_unlock(&percpu_counters_lock);
+	return NOTIFY_OK;
+}
+
+static int __init percpu_counter_startup(void)
+{
+	hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
+	return 0;
+}
+module_init(percpu_counter_startup);
+#endif
-- 
cgit v1.3-8-gc7d7


From 9aacd599342fdfc1fb9422f37e900609b7a46249 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 15 Jul 2007 23:39:56 -0700
Subject: fat: gcc 4.3 warning fix

This patch fixes the following warnings.

fs/fat/dir.c: In function 'fat_parse_long':
include/linux/msdos_fs.h:294: warning: array subscript is above array bounds
include/linux/msdos_fs.h:295: warning: array subscript is above array bounds
include/linux/msdos_fs.h:295: warning: array subscript is above array bounds

The ->name is defined as "name[8], ext[3]", but fat_checksum() uses
those as name[11]. There is no actual problem, but it's not a good manner.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c             | 31 ++++++++++++++++++-------------
 fs/fat/inode.c           |  3 +--
 include/linux/msdos_fs.h |  2 +-
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ccf161dffb63..72cbcd61bd95 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -313,7 +313,7 @@ int fat_search_long(struct inode *inode, const unsigned char *name,
 	wchar_t bufuname[14];
 	unsigned char xlate_len, nr_slots;
 	wchar_t *unicode = NULL;
-	unsigned char work[8], bufname[260];	/* 256 + 4 */
+	unsigned char work[MSDOS_NAME], bufname[260];	/* 256 + 4 */
 	int uni_xlate = sbi->options.unicode_xlate;
 	int utf8 = sbi->options.utf8;
 	int anycase = (sbi->options.name_check != 's');
@@ -351,7 +351,8 @@ parse_record:
 		if (work[0] == 0x05)
 			work[0] = 0xE5;
 		for (i = 0, j = 0, last_u = 0; i < 8;) {
-			if (!work[i]) break;
+			if (!work[i])
+				break;
 			chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
 						&bufuname[j++], opt_shortname,
 						de->lcase & CASE_LOWER_BASE);
@@ -365,13 +366,15 @@ parse_record:
 		}
 		j = last_u;
 		fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
-		for (i = 0; i < 3;) {
-			if (!de->ext[i]) break;
-			chl = fat_shortname2uni(nls_disk, &de->ext[i], 3 - i,
+		for (i = 8; i < MSDOS_NAME;) {
+			if (!work[i])
+				break;
+			chl = fat_shortname2uni(nls_disk, &work[i],
+						MSDOS_NAME - i,
 						&bufuname[j++], opt_shortname,
 						de->lcase & CASE_LOWER_EXT);
 			if (chl <= 1) {
-				if (de->ext[i] != ' ')
+				if (work[i] != ' ')
 					last_u = j;
 			} else {
 				last_u = j;
@@ -445,7 +448,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
 	int fill_len;
 	wchar_t bufuname[14];
 	wchar_t *unicode = NULL;
-	unsigned char c, work[8], bufname[56], *ptname = bufname;
+	unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname;
 	unsigned long lpos, dummy, *furrfu = &lpos;
 	int uni_xlate = sbi->options.unicode_xlate;
 	int isvfat = sbi->options.isvfat;
@@ -527,7 +530,8 @@ parse_record:
 	if (work[0] == 0x05)
 		work[0] = 0xE5;
 	for (i = 0, j = 0, last = 0, last_u = 0; i < 8;) {
-		if (!(c = work[i])) break;
+		if (!(c = work[i]))
+			break;
 		chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
 					&bufuname[j++], opt_shortname,
 					de->lcase & CASE_LOWER_BASE);
@@ -549,9 +553,10 @@ parse_record:
 	j = last_u;
 	fat_short2uni(nls_disk, ".", 1, &bufuname[j++]);
 	ptname[i++] = '.';
-	for (i2 = 0; i2 < 3;) {
-		if (!(c = de->ext[i2])) break;
-		chl = fat_shortname2uni(nls_disk, &de->ext[i2], 3 - i2,
+	for (i2 = 8; i2 < MSDOS_NAME;) {
+		if (!(c = work[i2]))
+			break;
+		chl = fat_shortname2uni(nls_disk, &work[i2], MSDOS_NAME - i2,
 					&bufuname[j++], opt_shortname,
 					de->lcase & CASE_LOWER_EXT);
 		if (chl <= 1) {
@@ -563,8 +568,8 @@ parse_record:
 			}
 		} else {
 			last_u = j;
-			for (chi = 0; chi < chl && i2 < 3; chi++) {
-				ptname[i++] = de->ext[i2++];
+			for (chi = 0; chi < chl && i2 < MSDOS_NAME; chi++) {
+				ptname[i++] = work[i2++];
 				last = i;
 			}
 		}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 479722d89667..cfaf5877d98b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -354,8 +354,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	} else { /* not a directory */
 		inode->i_generation |= 1;
 		inode->i_mode = MSDOS_MKMODE(de->attr,
-		    ((sbi->options.showexec &&
-			!is_exec(de->ext))
+		    ((sbi->options.showexec && !is_exec(de->name + 8))
 			? S_IRUGO|S_IWUGO : S_IRWXUGO)
 		    & ~sbi->options.fs_fmask) | S_IFREG;
 		MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 0e09c005dda8..f950921523f5 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -146,7 +146,7 @@ struct fat_boot_fsinfo {
 };
 
 struct msdos_dir_entry {
-	__u8	name[8],ext[3];	/* name and extension */
+	__u8	name[MSDOS_NAME];/* name and extension */
 	__u8	attr;		/* attribute bits */
 	__u8    lcase;		/* Case for base and extension */
 	__u8	ctime_cs;	/* Creation time, centiseconds (0-199) */
-- 
cgit v1.3-8-gc7d7


From 0a3021f4e249fbdb5f30d614707b5e02022e4c9b Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Sun, 15 Jul 2007 23:39:57 -0700
Subject: Remove unnecessary includes of spinlock.h under include/linux

Remove the obviously unnecessary includes of <linux/spinlock.h> under the
include/linux/ directory, and fix the couple errors that are introduced as
a result of that.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/attribute_container.h | 1 -
 include/linux/capability.h          | 1 -
 include/linux/console.h             | 1 -
 include/linux/ds17287rtc.h          | 1 -
 include/linux/ipc.h                 | 1 +
 include/linux/leds.h                | 1 -
 include/linux/module.h              | 1 -
 include/linux/percpu.h              | 2 +-
 include/linux/scx200_gpio.h         | 2 --
 include/linux/signal.h              | 1 -
 include/linux/smp_lock.h            | 1 -
 include/linux/timer.h               | 1 -
 12 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h
index 93bfb0beb62a..8ff274933948 100644
--- a/include/linux/attribute_container.h
+++ b/include/linux/attribute_container.h
@@ -12,7 +12,6 @@
 #include <linux/device.h>
 #include <linux/list.h>
 #include <linux/klist.h>
-#include <linux/spinlock.h>
 
 struct attribute_container {
 	struct list_head	node;
diff --git a/include/linux/capability.h b/include/linux/capability.h
index bbf8df7de28f..2dfa58555934 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -44,7 +44,6 @@ typedef struct __user_cap_data_struct {
   
 #ifdef __KERNEL__
 
-#include <linux/spinlock.h>
 #include <asm/current.h>
 
 /* #define STRICT_CAP_T_TYPECHECKS */
diff --git a/include/linux/console.h b/include/linux/console.h
index c44d3dfde7a5..56a7bcda49cb 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -15,7 +15,6 @@
 #define _LINUX_CONSOLE_H_ 1
 
 #include <linux/types.h>
-#include <linux/spinlock.h>
 
 struct vc_data;
 struct console_font_op;
diff --git a/include/linux/ds17287rtc.h b/include/linux/ds17287rtc.h
index c281ba42e28f..d85d3f497b96 100644
--- a/include/linux/ds17287rtc.h
+++ b/include/linux/ds17287rtc.h
@@ -11,7 +11,6 @@
 #define __LINUX_DS17287RTC_H
 
 #include <linux/rtc.h>			/* get the user-level API */
-#include <linux/spinlock.h>		/* spinlock_t */
 #include <linux/mc146818rtc.h>
 
 /* Register A */
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 1980867a64a4..7c8c6d8d090c 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -52,6 +52,7 @@ struct ipc_perm
 #ifdef __KERNEL__
 
 #include <linux/kref.h>
+#include <linux/spinlock.h>
 
 #define IPCMNI 32768  /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
 
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 88afceffb7cb..494bed7c2fc1 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -13,7 +13,6 @@
 #define __LINUX_LEDS_H_INCLUDED
 
 #include <linux/list.h>
-#include <linux/spinlock.h>
 
 struct device;
 struct class_device;
diff --git a/include/linux/module.h b/include/linux/module.h
index e6e0f86ef5fc..b6a646cea1cb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -6,7 +6,6 @@
  * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
  * Rewritten again by Rusty Russell, 2002
  */
-#include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/stat.h>
 #include <linux/compiler.h>
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index b72be2f79e6a..926adaae0f96 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -1,7 +1,7 @@
 #ifndef __LINUX_PERCPU_H
 #define __LINUX_PERCPU_H
 
-#include <linux/spinlock.h> /* For preempt_disable() */
+#include <linux/preempt.h>
 #include <linux/slab.h> /* For kmalloc() */
 #include <linux/smp.h>
 #include <linux/string.h> /* For memset() */
diff --git a/include/linux/scx200_gpio.h b/include/linux/scx200_gpio.h
index 1a82d30c4b17..d2b058130eb1 100644
--- a/include/linux/scx200_gpio.h
+++ b/include/linux/scx200_gpio.h
@@ -1,5 +1,3 @@
-#include <linux/spinlock.h>
-
 u32 scx200_gpio_configure(unsigned index, u32 set, u32 clear);
 
 extern unsigned scx200_gpio_base;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 9a5eac508e5e..ea91abe740da 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -6,7 +6,6 @@
 
 #ifdef __KERNEL__
 #include <linux/list.h>
-#include <linux/spinlock.h>
 
 /*
  * Real Time signals may be queued.
diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index cf715a40d833..58962c51dee1 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -3,7 +3,6 @@
 
 #ifdef CONFIG_LOCK_KERNEL
 #include <linux/sched.h>
-#include <linux/spinlock.h>
 
 #define kernel_locked()		(current->lock_depth >= 0)
 
diff --git a/include/linux/timer.h b/include/linux/timer.h
index c661710d3627..2b59e6d4219c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -3,7 +3,6 @@
 
 #include <linux/list.h>
 #include <linux/ktime.h>
-#include <linux/spinlock.h>
 #include <linux/stddef.h>
 
 struct tvec_t_base_s;
-- 
cgit v1.3-8-gc7d7


From e8d6c554126b830217c5e9f549e0e21f865a0a8a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 15 Jul 2007 23:40:12 -0700
Subject: AFS: implement file locking

Implement file locking for AFS.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/Makefile    |   1 +
 fs/afs/afs.h       |   8 +
 fs/afs/afs_fs.h    |   3 +
 fs/afs/callback.c  |   3 +
 fs/afs/dir.c       |   1 +
 fs/afs/file.c      |   2 +
 fs/afs/flock.c     | 558 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/afs/fsclient.c  | 155 ++++++++++++++-
 fs/afs/internal.h  |  30 +++
 fs/afs/main.c      |   1 +
 fs/afs/misc.c      |   1 +
 fs/afs/super.c     |   3 +
 fs/afs/vnode.c     | 130 +++++++++++--
 include/linux/fs.h |   4 +
 14 files changed, 885 insertions(+), 15 deletions(-)
 create mode 100644 fs/afs/flock.c

(limited to 'include/linux')

diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 73ce561f3ea0..a66671082cfb 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-objs := \
 	cmservice.o \
 	dir.o \
 	file.o \
+	flock.o \
 	fsclient.o \
 	inode.o \
 	main.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 245257948140..c548aa346f0d 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -37,6 +37,13 @@ typedef enum {
 	AFS_FTYPE_SYMLINK	= 3,
 } afs_file_type_t;
 
+typedef enum {
+	AFS_LOCK_READ		= 0,	/* read lock request */
+	AFS_LOCK_WRITE		= 1,	/* write lock request */
+} afs_lock_type_t;
+
+#define AFS_LOCKWAIT		(5 * 60) /* time until a lock times out (seconds) */
+
 /*
  * AFS file identifier
  */
@@ -120,6 +127,7 @@ struct afs_file_status {
 	struct afs_fid		parent;		/* parent dir ID for non-dirs only */
 	time_t			mtime_client;	/* last time client changed data */
 	time_t			mtime_server;	/* last time server changed data */
+	s32			lock_count;	/* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
 };
 
 /*
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index a18c374ebe08..eb647323d8f0 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -31,6 +31,9 @@ enum AFS_FS_Operations {
 	FSGETVOLUMEINFO		= 148,	/* AFS Get information about a volume */
 	FSGETVOLUMESTATUS	= 149,	/* AFS Get volume status information */
 	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
+	FSSETLOCK		= 156,	/* AFS Request a file lock */
+	FSEXTENDLOCK		= 157,	/* AFS Extend a file lock */
+	FSRELEASELOCK		= 158,	/* AFS Release a file lock */
 	FSLOOKUP		= 161,	/* AFS lookup file in directory */
 	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
 	FSSTOREDATA64		= 65538, /* AFS Store file data */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index bacf518c6fa8..b8243945818d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -125,6 +125,9 @@ static void afs_break_callback(struct afs_server *server,
 		spin_unlock(&server->cb_lock);
 
 		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+		if (list_empty(&vnode->granted_locks) &&
+		    !list_empty(&vnode->pending_locks))
+			afs_lock_may_be_available(vnode);
 		spin_unlock(&vnode->lock);
 	}
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 546c59522eb1..33fe39ad4e03 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -44,6 +44,7 @@ const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
 	.release	= afs_release,
 	.readdir	= afs_readdir,
+	.lock		= afs_lock,
 };
 
 const struct inode_operations afs_dir_inode_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index aede7eb66dd4..525f7c56e068 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -34,6 +34,8 @@ const struct file_operations afs_file_operations = {
 	.mmap		= generic_file_readonly_mmap,
 	.splice_read	= generic_file_splice_read,
 	.fsync		= afs_fsync,
+	.lock		= afs_lock,
+	.flock		= afs_flock,
 };
 
 const struct inode_operations afs_file_inode_operations = {
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
new file mode 100644
index 000000000000..8f07f8d1bfa9
--- /dev/null
+++ b/fs/afs/flock.c
@@ -0,0 +1,558 @@
+/* AFS file locking support
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/smp_lock.h>
+#include "internal.h"
+
+#define AFS_LOCK_GRANTED	0
+#define AFS_LOCK_PENDING	1
+
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
+static void afs_fl_release_private(struct file_lock *fl);
+
+static struct workqueue_struct *afs_lock_manager;
+
+static struct file_lock_operations afs_lock_ops = {
+	.fl_copy_lock		= afs_fl_copy_lock,
+	.fl_release_private	= afs_fl_release_private,
+};
+
+/*
+ * initialise the lock manager thread if it isn't already running
+ */
+static int afs_init_lock_manager(void)
+{
+	if (!afs_lock_manager) {
+		afs_lock_manager = create_singlethread_workqueue("kafs_lockd");
+		if (!afs_lock_manager)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * destroy the lock manager thread if it's running
+ */
+void __exit afs_kill_lock_manager(void)
+{
+	if (afs_lock_manager)
+		destroy_workqueue(afs_lock_manager);
+}
+
+/*
+ * if the callback is broken on this vnode, then the lock may now be available
+ */
+void afs_lock_may_be_available(struct afs_vnode *vnode)
+{
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0);
+}
+
+/*
+ * the lock will time out in 5 minutes unless we extend it, so schedule
+ * extension in a bit less than that time
+ */
+static void afs_schedule_lock_extension(struct afs_vnode *vnode)
+{
+	queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+			   AFS_LOCKWAIT * HZ / 2);
+}
+
+/*
+ * do work for a lock, including:
+ * - probing for a lock we're waiting on but didn't get immediately
+ * - extending a lock that's close to timing out
+ */
+void afs_lock_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, lock_work.work);
+	struct file_lock *fl;
+	afs_lock_type_t type;
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	spin_lock(&vnode->lock);
+
+	if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) {
+		_debug("unlock");
+		spin_unlock(&vnode->lock);
+
+		/* attempt to release the server lock; if it fails, we just
+		 * wait 5 minutes and it'll time out anyway */
+		ret = afs_vnode_release_lock(vnode, vnode->unlock_key);
+		if (ret < 0)
+			printk(KERN_WARNING "AFS:"
+			       " Failed to release lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+
+		spin_lock(&vnode->lock);
+		key_put(vnode->unlock_key);
+		vnode->unlock_key = NULL;
+		clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags);
+	}
+
+	/* if we've got a lock, then it must be time to extend that lock as AFS
+	 * locks time out after 5 minutes */
+	if (!list_empty(&vnode->granted_locks)) {
+		_debug("extend");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->granted_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_extend_lock(vnode, key);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		key_put(key);
+		switch (ret) {
+		case 0:
+			afs_schedule_lock_extension(vnode);
+			break;
+		default:
+			/* ummm... we failed to extend the lock - retry
+			 * extension shortly */
+			printk(KERN_WARNING "AFS:"
+			       " Failed to extend lock on {%x:%x} error %d\n",
+			       vnode->fid.vid, vnode->fid.vnode, ret);
+			queue_delayed_work(afs_lock_manager, &vnode->lock_work,
+					   HZ * 10);
+			break;
+		}
+		_leave(" [extend]");
+		return;
+	}
+
+	/* if we don't have a granted lock, then we must've been called back by
+	 * the server, and so if might be possible to get a lock we're
+	 * currently waiting for */
+	if (!list_empty(&vnode->pending_locks)) {
+		_debug("get");
+
+		if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
+			BUG();
+		fl = list_entry(vnode->pending_locks.next,
+				struct file_lock, fl_u.afs.link);
+		key = key_get(fl->fl_file->private_data);
+		type = (fl->fl_type == F_RDLCK) ?
+			AFS_LOCK_READ : AFS_LOCK_WRITE;
+		spin_unlock(&vnode->lock);
+
+		ret = afs_vnode_set_lock(vnode, key, type);
+		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+		switch (ret) {
+		case -EWOULDBLOCK:
+			_debug("blocked");
+			break;
+		case 0:
+			_debug("acquired");
+			if (type == AFS_LOCK_READ)
+				set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+			else
+				set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+			ret = AFS_LOCK_GRANTED;
+		default:
+			spin_lock(&vnode->lock);
+			/* the pending lock may have been withdrawn due to a
+			 * signal */
+			if (list_entry(vnode->pending_locks.next,
+				       struct file_lock, fl_u.afs.link) == fl) {
+				fl->fl_u.afs.state = ret;
+				if (ret == AFS_LOCK_GRANTED)
+					list_move_tail(&fl->fl_u.afs.link,
+						       &vnode->granted_locks);
+				else
+					list_del_init(&fl->fl_u.afs.link);
+				wake_up(&fl->fl_wait);
+				spin_unlock(&vnode->lock);
+			} else {
+				_debug("withdrawn");
+				clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+				clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+				spin_unlock(&vnode->lock);
+				afs_vnode_release_lock(vnode, key);
+				if (!list_empty(&vnode->pending_locks))
+					afs_lock_may_be_available(vnode);
+			}
+			break;
+		}
+		key_put(key);
+		_leave(" [pend]");
+		return;
+	}
+
+	/* looks like the lock request was withdrawn on a signal */
+	spin_unlock(&vnode->lock);
+	_leave(" [no locks]");
+}
+
+/*
+ * pass responsibility for the unlocking of a vnode on the server to the
+ * manager thread, lest a pending signal in the calling thread interrupt
+ * AF_RXRPC
+ * - the caller must hold the vnode lock
+ */
+static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
+{
+	cancel_delayed_work(&vnode->lock_work);
+	if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) &&
+	    !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
+		BUG();
+	if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags))
+		BUG();
+	vnode->unlock_key = key_get(key);
+	afs_lock_may_be_available(vnode);
+}
+
+/*
+ * request a lock on a file on the server
+ */
+static int afs_do_setlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	afs_lock_type_t type;
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file locks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	ret = afs_init_lock_manager();
+	if (ret < 0)
+		return ret;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+
+	lock_kernel();
+
+	/* make sure we've got a callback on this file and that our view of the
+	 * data version is up to date */
+	ret = afs_vnode_fetch_status(vnode, NULL, key);
+	if (ret < 0)
+		goto error;
+
+	if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) {
+		ret = -EAGAIN;
+		goto error;
+	}
+
+	spin_lock(&vnode->lock);
+
+	if (list_empty(&vnode->pending_locks)) {
+		/* if there's no-one else with a lock on this vnode, then we
+		 * need to ask the server for a lock */
+		if (list_empty(&vnode->granted_locks)) {
+			_debug("not locked");
+			ASSERTCMP(vnode->flags &
+				  ((1 << AFS_VNODE_LOCKING) |
+				   (1 << AFS_VNODE_READLOCKED) |
+				   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+			list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+			set_bit(AFS_VNODE_LOCKING, &vnode->flags);
+			spin_unlock(&vnode->lock);
+
+			ret = afs_vnode_set_lock(vnode, key, type);
+			clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
+			switch (ret) {
+			case 0:
+				goto acquired_server_lock;
+			case -EWOULDBLOCK:
+				spin_lock(&vnode->lock);
+				ASSERT(list_empty(&vnode->granted_locks));
+				ASSERTCMP(vnode->pending_locks.next, ==,
+					  &fl->fl_u.afs.link);
+				goto wait;
+			default:
+				spin_lock(&vnode->lock);
+				list_del_init(&fl->fl_u.afs.link);
+				spin_unlock(&vnode->lock);
+				goto error;
+			}
+		}
+
+		/* if we've already got a readlock on the server and no waiting
+		 * writelocks, then we might be able to instantly grant another
+		 * readlock */
+		if (type == AFS_LOCK_READ &&
+		    vnode->flags & (1 << AFS_VNODE_READLOCKED)) {
+			_debug("instant readlock");
+			ASSERTCMP(vnode->flags &
+				  ((1 << AFS_VNODE_LOCKING) |
+				   (1 << AFS_VNODE_WRITELOCKED)), ==, 0);
+			ASSERT(!list_empty(&vnode->granted_locks));
+			goto sharing_existing_lock;
+		}
+	}
+
+	/* otherwise, we need to wait for a local lock to become available */
+	_debug("wait local");
+	list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
+wait:
+	if (!(fl->fl_flags & FL_SLEEP)) {
+		_debug("noblock");
+		ret = -EAGAIN;
+		goto abort_attempt;
+	}
+	spin_unlock(&vnode->lock);
+
+	/* now we need to sleep and wait for the lock manager thread to get the
+	 * lock from the server */
+	_debug("sleep");
+	ret = wait_event_interruptible(fl->fl_wait,
+				       fl->fl_u.afs.state <= AFS_LOCK_GRANTED);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0)
+			goto error;
+		spin_lock(&vnode->lock);
+		goto given_lock;
+	}
+
+	/* we were interrupted, but someone may still be in the throes of
+	 * giving us the lock */
+	_debug("intr");
+	ASSERTCMP(ret, ==, -ERESTARTSYS);
+
+	spin_lock(&vnode->lock);
+	if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
+		ret = fl->fl_u.afs.state;
+		if (ret < 0) {
+			spin_unlock(&vnode->lock);
+			goto error;
+		}
+		goto given_lock;
+	}
+
+abort_attempt:
+	/* we aren't going to get the lock, either because we're unwilling to
+	 * wait, or because some signal happened */
+	_debug("abort");
+	if (list_empty(&vnode->granted_locks) &&
+	    vnode->pending_locks.next == &fl->fl_u.afs.link) {
+		if (vnode->pending_locks.prev != &fl->fl_u.afs.link) {
+			/* kick the next pending lock into having a go */
+			list_del_init(&fl->fl_u.afs.link);
+			afs_lock_may_be_available(vnode);
+		}
+	} else {
+		list_del_init(&fl->fl_u.afs.link);
+	}
+	spin_unlock(&vnode->lock);
+	goto error;
+
+acquired_server_lock:
+	/* we've acquired a server lock, but it needs to be renewed after 5
+	 * mins */
+	spin_lock(&vnode->lock);
+	afs_schedule_lock_extension(vnode);
+	if (type == AFS_LOCK_READ)
+		set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
+	else
+		set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
+sharing_existing_lock:
+	/* the lock has been granted as far as we're concerned... */
+	fl->fl_u.afs.state = AFS_LOCK_GRANTED;
+	list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
+given_lock:
+	/* ... but we do still need to get the VFS's blessing */
+	ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING)));
+	ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) |
+				(1 << AFS_VNODE_WRITELOCKED))) != 0);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0)
+		goto vfs_rejected_lock;
+	spin_unlock(&vnode->lock);
+
+	/* again, make sure we've got a callback on this file and, again, make
+	 * sure that our view of the data version is up to date (we ignore
+	 * errors incurred here and deal with the consequences elsewhere) */
+	afs_vnode_fetch_status(vnode, NULL, key);
+
+error:
+	unlock_kernel();
+	_leave(" = %d", ret);
+	return ret;
+
+vfs_rejected_lock:
+	/* the VFS rejected the lock we just obtained, so we have to discard
+	 * what we just got */
+	_debug("vfs refused %d", ret);
+	list_del_init(&fl->fl_u.afs.link);
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	spin_unlock(&vnode->lock);
+	goto abort_attempt;
+}
+
+/*
+ * unlock on a file on the server
+ */
+static int afs_do_unlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret;
+
+	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+
+	/* only whole-file unlocks are supported */
+	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
+		return -EINVAL;
+
+	fl->fl_ops = &afs_lock_ops;
+	INIT_LIST_HEAD(&fl->fl_u.afs.link);
+	fl->fl_u.afs.state = AFS_LOCK_PENDING;
+
+	spin_lock(&vnode->lock);
+	ret = posix_lock_file(file, fl, NULL);
+	if (ret < 0) {
+		spin_unlock(&vnode->lock);
+		_leave(" = %d [vfs]", ret);
+		return ret;
+	}
+
+	/* discard the server lock only if all granted locks are gone */
+	if (list_empty(&vnode->granted_locks))
+		afs_defer_unlock(vnode, key);
+	spin_unlock(&vnode->lock);
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * return information about a lock we currently hold, if indeed we hold one
+ */
+static int afs_do_getlk(struct file *file, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct key *key = file->private_data;
+	int ret, lock_count;
+
+	_enter("");
+
+	fl->fl_type = F_UNLCK;
+
+	mutex_lock(&vnode->vfs_inode.i_mutex);
+
+	/* check local lock records first */
+	ret = 0;
+	if (posix_test_lock(file, fl) == 0) {
+		/* no local locks; consult the server */
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error;
+		lock_count = vnode->status.lock_count;
+		if (lock_count) {
+			if (lock_count > 0)
+				fl->fl_type = F_RDLCK;
+			else
+				fl->fl_type = F_WRLCK;
+			fl->fl_start = 0;
+			fl->fl_end = OFFSET_MAX;
+		}
+	}
+
+error:
+	mutex_unlock(&vnode->vfs_inode.i_mutex);
+	_leave(" = %d [%hd]", ret, fl->fl_type);
+	return ret;
+}
+
+/*
+ * manage POSIX locks on a file
+ */
+int afs_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags,
+	       (long long) fl->fl_start, (long long) fl->fl_end);
+
+	/* AFS doesn't support mandatory locks */
+	if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    fl->fl_type != F_UNLCK)
+		return -ENOLCK;
+
+	if (IS_GETLK(cmd))
+		return afs_do_getlk(file, fl);
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * manage FLOCK locks on a file
+ */
+int afs_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+
+	_enter("{%x:%u},%d,{t=%x,fl=%x}",
+	       vnode->fid.vid, vnode->fid.vnode, cmd,
+	       fl->fl_type, fl->fl_flags);
+
+	/*
+	 * No BSD flocks over NFS allowed.
+	 * Note: we could try to fake a POSIX lock request here by
+	 * using ((u32) filp | 0x80000000) or some such as the pid.
+	 * Not sure whether that would be unique, though, or whether
+	 * that would break in other places.
+	 */
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+
+	/* we're simulating flock() locks using posix locks on the server */
+	fl->fl_owner = (fl_owner_t) file;
+	fl->fl_start = 0;
+	fl->fl_end = OFFSET_MAX;
+
+	if (fl->fl_type == F_UNLCK)
+		return afs_do_unlk(file, fl);
+	return afs_do_setlk(file, fl);
+}
+
+/*
+ * the POSIX lock management core VFS code copies the lock record and adds the
+ * copy into its own list, so we need to add that copy to the vnode's lock
+ * queue in the same place as the original (which will be deleted shortly
+ * after)
+ */
+static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
+{
+	_enter("");
+
+	list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link);
+}
+
+/*
+ * need to remove this lock from the vnode queue when it's removed from the
+ * VFS's list
+ */
+static void afs_fl_release_private(struct file_lock *fl)
+{
+	_enter("");
+
+	list_del_init(&fl->fl_u.afs.link);
+}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 5dff1308b6f0..023b95b0d9d7 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -67,7 +67,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	EXTRACT(status->group);
 	bp++; /* sync counter */
 	data_version |= (u64) ntohl(*bp++) << 32;
-	bp++; /* lock count */
+	EXTRACT(status->lock_count);
 	size |= (u64) ntohl(*bp++) << 32;
 	bp++; /* spare 4 */
 	*_bp = bp;
@@ -1748,3 +1748,156 @@ int afs_fs_get_volume_status(struct afs_server *server,
 
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
+
+/*
+ * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock
+ */
+static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
+				    struct sk_buff *skb, bool last)
+{
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.SetLock operation type
+ */
+static const struct afs_call_type afs_RXFSSetLock = {
+	.name		= "FS.SetLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ExtendLock operation type
+ */
+static const struct afs_call_type afs_RXFSExtendLock = {
+	.name		= "FS.ExtendLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * FS.ReleaseLock operation type
+ */
+static const struct afs_call_type afs_RXFSReleaseLock = {
+	.name		= "FS.ReleaseLock",
+	.deliver	= afs_deliver_fs_xxxx_lock,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * get a lock on a file
+ */
+int afs_fs_set_lock(struct afs_server *server,
+		    struct key *key,
+		    struct afs_vnode *vnode,
+		    afs_lock_type_t type,
+		    const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSETLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(type);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_fs_extend_lock(struct afs_server *server,
+		       struct key *key,
+		       struct afs_vnode *vnode,
+		       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSEXTENDLOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_fs_release_lock(struct afs_server *server,
+			struct key *key,
+			struct afs_vnode *vnode,
+			const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRELEASELOCK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 2c55dd94a1de..6306438f331f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -351,10 +351,18 @@ struct afs_vnode {
 #define AFS_VNODE_ZAP_DATA	3		/* set if vnode's data should be invalidated */
 #define AFS_VNODE_DELETED	4		/* set if vnode deleted on server */
 #define AFS_VNODE_MOUNTPOINT	5		/* set if vnode is a mountpoint symlink */
+#define AFS_VNODE_LOCKING	6		/* set if waiting for lock on vnode */
+#define AFS_VNODE_READLOCKED	7		/* set if vnode is read-locked on the server */
+#define AFS_VNODE_WRITELOCKED	8		/* set if vnode is write-locked on the server */
+#define AFS_VNODE_UNLOCKING	9		/* set if vnode is being unlocked on the server */
 
 	long			acl_order;	/* ACL check count (callback break count) */
 
 	struct list_head	writebacks;	/* alterations in pagecache that need writing */
+	struct list_head	pending_locks;	/* locks waiting to be granted */
+	struct list_head	granted_locks;	/* locks granted on this file */
+	struct delayed_work	lock_work;	/* work to be done in locking */
+	struct key		*unlock_key;	/* key to be used in unlocking */
 
 	/* outstanding callback notification on this file */
 	struct rb_node		server_rb;	/* link in server->fs_vnodes */
@@ -473,6 +481,15 @@ extern const struct file_operations afs_file_operations;
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
 
+/*
+ * flock.c
+ */
+extern void __exit afs_kill_lock_manager(void);
+extern void afs_lock_work(struct work_struct *);
+extern void afs_lock_may_be_available(struct afs_vnode *);
+extern int afs_lock(struct file *, int, struct file_lock *);
+extern int afs_flock(struct file *, int, struct file_lock *);
+
 /*
  * fsclient.c
  */
@@ -513,6 +530,15 @@ extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
 				    struct afs_vnode *,
 				    struct afs_volume_status *,
 				    const struct afs_wait_mode *);
+extern int afs_fs_set_lock(struct afs_server *, struct key *,
+			   struct afs_vnode *, afs_lock_type_t,
+			   const struct afs_wait_mode *);
+extern int afs_fs_extend_lock(struct afs_server *, struct key *,
+			      struct afs_vnode *,
+			      const struct afs_wait_mode *);
+extern int afs_fs_release_lock(struct afs_server *, struct key *,
+			       struct afs_vnode *,
+			       const struct afs_wait_mode *);
 
 /*
  * inode.c
@@ -681,6 +707,10 @@ extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
 extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
 extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
 				       struct afs_volume_status *);
+extern int afs_vnode_set_lock(struct afs_vnode *, struct key *,
+			      afs_lock_type_t);
+extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *);
+extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 
 /*
  * volume.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cd21195bbb24..0f60f6b35769 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -168,6 +168,7 @@ static void __exit afs_exit(void)
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
 
 	afs_fs_exit();
+	afs_kill_lock_manager();
 	afs_close_socket();
 	afs_purge_servers();
 	afs_callback_update_kill();
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index d1a889c40742..2d33a5f7d218 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -35,6 +35,7 @@ int afs_abort_to_error(u32 abort_code)
 	case VOVERQUOTA:	return -EDQUOT;
 	case VBUSY:		return -EBUSY;
 	case VMOVED:		return -ENXIO;
+	case 0x2f6df0a:		return -EWOULDBLOCK;
 	case 0x2f6df0c:		return -EACCES;
 	case 0x2f6df0f:		return -EBUSY;
 	case 0x2f6df10:		return -EEXIST;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 2e8496ba1205..993cdf1cce3a 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -460,6 +460,9 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
 	spin_lock_init(&vnode->writeback_lock);
 	spin_lock_init(&vnode->lock);
 	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_LIST_HEAD(&vnode->pending_locks);
+	INIT_LIST_HEAD(&vnode->granted_locks);
+	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
 	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
 }
 
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 232c55dc245d..ac811e445b2d 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -887,11 +887,6 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 	       vnode->fid.unique,
 	       key_serial(key));
 
-	/* this op will fetch the status */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
 	do {
 		/* pick a server to query */
 		server = afs_volume_pick_fileserver(vnode);
@@ -905,20 +900,127 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * get a lock on a file
+ */
+int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
+		       afs_lock_type_t type)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%u",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key), type);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * extend a lock on a file
+ */
+int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
+		afs_put_server(server);
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	return PTR_ERR(server);
+}
+
+/*
+ * release a lock on a file
+ */
+int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0)
 		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-	}
 
 	_leave(" = %d", ret);
 	return ret;
 
 no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
 	return PTR_ERR(server);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 51c938a71dec..aa4530c1ff7a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -820,6 +820,10 @@ struct file_lock {
 	union {
 		struct nfs_lock_info	nfs_fl;
 		struct nfs4_lock_info	nfs4_fl;
+		struct {
+			struct list_head link;	/* link in AFS vnode's pending_locks list */
+			int state;		/* state of grant or error if -ve */
+		} afs;
 	} fl_u;
 };
 
-- 
cgit v1.3-8-gc7d7


From 9e7bf24b1b979db256ddc84d0d4ac6040d706da6 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sun, 15 Jul 2007 23:40:25 -0700
Subject: fs: clarify "dummy" member in struct inodes_stat_t

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index aa4530c1ff7a..e68780810279 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -44,7 +44,7 @@ extern int get_max_files(void);
 struct inodes_stat_t {
 	int nr_inodes;
 	int nr_unused;
-	int dummy[5];
+	int dummy[5];		/* padding for sysctl ABI compatibility */
 };
 extern struct inodes_stat_t inodes_stat;
 
-- 
cgit v1.3-8-gc7d7


From 1b0fac45878bb88759eec347c273285195649ff7 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 15 Jul 2007 23:40:26 -0700
Subject: dma-mapping: prevent dma dependent code from linking on !HAS_DMA
 archs

Continuing the work started in 411f0f3edc141a582190d3605cadd1d993abb6df ...

This enables code with a dma path, that compiles away, to build without
requiring additional code factoring.  It also prevents code that calls
dma_alloc_coherent and dma_free_coherent from linking whereas previously
the code would hit a BUG() at run time.  Finally, it allows archs that set
!HAS_DMA to delete their asm/dma-mapping.h file.

Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: John W. Linville <linville@tuxdriver.com>
Cc: Kyle McMartin <kyle@parisc-linux.org>
Cc: James Bottomley <James.Bottomley@SteelEye.com>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: <geert@linux-m68k.org>
Cc: <zippel@linux-m68k.org>
Cc: <spyro@f2s.com>
Cc: <ysato@users.sourceforge.jp>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm26/Kconfig                       |  3 ++
 arch/h8300/Kconfig                       |  3 ++
 arch/m32r/Kconfig                        |  3 ++
 drivers/dma/Kconfig                      |  2 +-
 include/asm-arm26/dma-mapping.h          |  2 -
 include/asm-generic/dma-mapping-broken.h | 82 +++++++++++++++++++++++++++-----
 include/asm-h8300/dma-mapping.h          |  1 -
 include/asm-m32r/dma-mapping.h           |  6 ---
 include/asm-s390/dma-mapping.h           | 12 -----
 include/linux/dma-mapping.h              |  4 ++
 10 files changed, 84 insertions(+), 34 deletions(-)
 delete mode 100644 include/asm-arm26/dma-mapping.h
 delete mode 100644 include/asm-h8300/dma-mapping.h
 delete mode 100644 include/asm-m32r/dma-mapping.h
 delete mode 100644 include/asm-s390/dma-mapping.h

(limited to 'include/linux')

diff --git a/arch/arm26/Kconfig b/arch/arm26/Kconfig
index 20688bc13e9b..9044f33299f7 100644
--- a/arch/arm26/Kconfig
+++ b/arch/arm26/Kconfig
@@ -17,6 +17,9 @@ config MMU
 	bool
 	default y
 
+config NO_DMA
+	def_bool y
+
 config ARCH_ACORN
         bool
         default y
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 618dbad696f6..e35f74e6e505 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -68,6 +68,9 @@ config TIME_LOW_RES
 config NO_IOPORT
 	def_bool y
 
+config NO_DMA
+	def_bool y
+
 config ISA
 	bool
 	default y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c3bb8a755b00..8ccf3e47bff8 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -31,6 +31,9 @@ config GENERIC_IRQ_PROBE
 config NO_IOPORT
 	def_bool y
 
+config NO_DMA
+	def_bool y
+
 source "init/Kconfig"
 
 
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index b31756d59978..8f670dae53bb 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -3,7 +3,7 @@
 #
 
 menu "DMA Engine support"
-	depends on !S390
+	depends on HAS_DMA
 
 config DMA_ENGINE
 	bool "Support for DMA engines"
diff --git a/include/asm-arm26/dma-mapping.h b/include/asm-arm26/dma-mapping.h
deleted file mode 100644
index a95eae0aeb77..000000000000
--- a/include/asm-arm26/dma-mapping.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#include <asm-generic/dma-mapping-broken.h>
-
diff --git a/include/asm-generic/dma-mapping-broken.h b/include/asm-generic/dma-mapping-broken.h
index 29413d3d4605..e2468f894d2a 100644
--- a/include/asm-generic/dma-mapping-broken.h
+++ b/include/asm-generic/dma-mapping-broken.h
@@ -1,24 +1,82 @@
 #ifndef _ASM_GENERIC_DMA_MAPPING_H
 #define _ASM_GENERIC_DMA_MAPPING_H
 
-/* This is used for archs that do not support DMA */
+/* define the dma api to allow compilation but not linking of
+ * dma dependent code.  Code that depends on the dma-mapping
+ * API needs to set 'depends on HAS_DMA' in its Kconfig
+ */
 
-static inline void *
+struct scatterlist;
+
+extern void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   gfp_t flag)
-{
-	BUG();
-	return NULL;
-}
+		   gfp_t flag);
 
-static inline void
+extern void
 dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
-		    dma_addr_t dma_handle)
-{
-	BUG();
-}
+		    dma_addr_t dma_handle);
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
+extern dma_addr_t
+dma_map_single(struct device *dev, void *ptr, size_t size,
+	       enum dma_data_direction direction);
+
+extern void
+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+		 enum dma_data_direction direction);
+
+extern int
+dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+	   enum dma_data_direction direction);
+
+extern void
+dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
+	     enum dma_data_direction direction);
+
+extern dma_addr_t
+dma_map_page(struct device *dev, struct page *page, unsigned long offset,
+	     size_t size, enum dma_data_direction direction);
+
+extern void
+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+	       enum dma_data_direction direction);
+
+extern void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+			enum dma_data_direction direction);
+
+extern void
+dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
+			      unsigned long offset, size_t size,
+			      enum dma_data_direction direction);
+
+extern void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+		    enum dma_data_direction direction);
+
+#define dma_sync_single_for_device dma_sync_single_for_cpu
+#define dma_sync_single_range_for_device dma_sync_single_range_for_cpu
+#define dma_sync_sg_for_device dma_sync_sg_for_cpu
+
+extern int
+dma_mapping_error(dma_addr_t dma_addr);
+
+extern int
+dma_supported(struct device *dev, u64 mask);
+
+extern int
+dma_set_mask(struct device *dev, u64 mask);
+
+extern int
+dma_get_cache_alignment(void);
+
+extern int
+dma_is_consistent(struct device *dev, dma_addr_t dma_handle);
+
+extern void
+dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+	       enum dma_data_direction direction);
+
 #endif /* _ASM_GENERIC_DMA_MAPPING_H */
diff --git a/include/asm-h8300/dma-mapping.h b/include/asm-h8300/dma-mapping.h
deleted file mode 100644
index d00e40099165..000000000000
--- a/include/asm-h8300/dma-mapping.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/dma-mapping-broken.h>
diff --git a/include/asm-m32r/dma-mapping.h b/include/asm-m32r/dma-mapping.h
deleted file mode 100644
index f9b58ebba361..000000000000
--- a/include/asm-m32r/dma-mapping.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_M32R_DMA_MAPPING_H
-#define _ASM_M32R_DMA_MAPPING_H
-
-#include <asm-generic/dma-mapping-broken.h>
-
-#endif /* _ASM_M32R_DMA_MAPPING_H */
diff --git a/include/asm-s390/dma-mapping.h b/include/asm-s390/dma-mapping.h
deleted file mode 100644
index 3f8c12fde0f0..000000000000
--- a/include/asm-s390/dma-mapping.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- *  include/asm-s390/dma-mapping.h
- *
- *  S390 version
- *
- *  This file exists so that #include <dma-mapping.h> doesn't break anything.
- */
-
-#ifndef _ASM_DMA_MAPPING_H
-#define _ASM_DMA_MAPPING_H
-
-#endif /* _ASM_DMA_MAPPING_H */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 9a663c6db16a..2dc21cbeb304 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -31,7 +31,11 @@ static inline int valid_dma_direction(int dma_direction)
 		(dma_direction == DMA_FROM_DEVICE));
 }
 
+#ifdef CONFIG_HAS_DMA
 #include <asm/dma-mapping.h>
+#else
+#include <asm-generic/dma-mapping-broken.h>
+#endif
 
 /* Backwards compat, remove in 2.7.x */
 #define dma_sync_single		dma_sync_single_for_cpu
-- 
cgit v1.3-8-gc7d7


From e0807061908a7a9441d0f745deb444f7216904cb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 15 Jul 2007 23:40:30 -0700
Subject: remove odd and misleading comments from uio.h

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uio.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 9af8bbcd8963..b7fe13883bdb 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -13,10 +13,6 @@
  *		2 of the License, or (at your option) any later version.
  */
 
-
-/* A word of warning: Our uio structure will clash with the C library one (which is now obsolete). Remove the C
-   library one from sys/uio.h if you have a very old library set */
-
 struct iovec
 {
 	void __user *iov_base;	/* BSD uses caddr_t (1003.1g requires void *) */
@@ -38,11 +34,6 @@ struct kvec {
  
 #define UIO_FASTIOV	8
 #define UIO_MAXIOV	1024
-#if 0
-#define UIO_MAXIOV	16	/* Maximum iovec's in one operation 
-				   16 matches BSD */
-                                /* Beg pardon: BSD has 1024 --ANK */
-#endif
 
 /*
  * Total number of bytes covered by an iovec.
-- 
cgit v1.3-8-gc7d7


From c5c061b8f9726bc2c25e19dec227933a13d1e6b7 Mon Sep 17 00:00:00 2001
From: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Date: Sun, 15 Jul 2007 23:40:30 -0700
Subject: Add a flag to indicate deferrable timers in /proc/timer_stats

Add a flag in /proc/timer_stats to indicate deferrable timers.  This will
let developers/users to differentiate between types of tiemrs in
/proc/timer_stats.

Deferrable timer and normal timer will appear in /proc/timer_stats as below.
  10D,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
   10,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)

Also version of timer_stats changes from v0.1 to v0.2

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/hrtimer/timer_stats.txt |  4 ++++
 include/linux/hrtimer.h               |  5 +++--
 include/linux/timer.h                 | 15 ++++-----------
 kernel/time/timer_stats.c             | 14 +++++++++++---
 kernel/timer.c                        | 14 ++++++++++++++
 5 files changed, 36 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/hrtimer/timer_stats.txt b/Documentation/hrtimer/timer_stats.txt
index 22b0814d0ad0..20d368c59814 100644
--- a/Documentation/hrtimer/timer_stats.txt
+++ b/Documentation/hrtimer/timer_stats.txt
@@ -67,3 +67,7 @@ executed on expiry.
 
     Thomas, Ingo
 
+Added flag to indicate 'deferrable timer' in /proc/timer_stats. A deferrable
+timer will appear as follows
+  10D,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
+
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 17c29dca8354..540799bc85f8 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -329,12 +329,13 @@ extern void sysrq_timer_list_show(void);
 #ifdef CONFIG_TIMER_STATS
 
 extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-				     void *timerf, char * comm);
+				     void *timerf, char *comm,
+				     unsigned int timer_flag);
 
 static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
 {
 	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
-				 timer->function, timer->start_comm);
+				 timer->function, timer->start_comm, 0);
 }
 
 extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer,
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 2b59e6d4219c..78cf899b4409 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -90,16 +90,13 @@ extern unsigned long get_next_timer_interrupt(unsigned long now);
  */
 #ifdef CONFIG_TIMER_STATS
 
+#define TIMER_STATS_FLAG_DEFERRABLE	0x1
+
 extern void init_timer_stats(void);
 
 extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-				     void *timerf, char * comm);
-
-static inline void timer_stats_account_timer(struct timer_list *timer)
-{
-	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
-				 timer->function, timer->start_comm);
-}
+				     void *timerf, char *comm,
+				     unsigned int timer_flag);
 
 extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
 					       void *addr);
@@ -118,10 +115,6 @@ static inline void init_timer_stats(void)
 {
 }
 
-static inline void timer_stats_account_timer(struct timer_list *timer)
-{
-}
-
 static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
 {
 }
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 321693724ad7..9b8a826236dd 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,6 +68,7 @@ struct entry {
 	 * Number of timeout events:
 	 */
 	unsigned long		count;
+	unsigned int		timer_flag;
 
 	/*
 	 * We save the command-line string to preserve
@@ -231,7 +232,8 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
  * incremented. Otherwise the timer is registered in a free slot.
  */
 void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-			      void *timerf, char * comm)
+			      void *timerf, char *comm,
+			      unsigned int timer_flag)
 {
 	/*
 	 * It doesnt matter which lock we take:
@@ -249,6 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	input.start_func = startf;
 	input.expire_func = timerf;
 	input.pid = pid;
+	input.timer_flag = timer_flag;
 
 	spin_lock_irqsave(lock, flags);
 	if (!active)
@@ -295,7 +298,7 @@ static int tstats_show(struct seq_file *m, void *v)
 	period = ktime_to_timespec(time);
 	ms = period.tv_nsec / 1000000;
 
-	seq_puts(m, "Timer Stats Version: v0.1\n");
+	seq_puts(m, "Timer Stats Version: v0.2\n");
 	seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
 	if (atomic_read(&overflow_count))
 		seq_printf(m, "Overflow: %d entries\n",
@@ -303,8 +306,13 @@ static int tstats_show(struct seq_file *m, void *v)
 
 	for (i = 0; i < nr_entries; i++) {
 		entry = entries + i;
-		seq_printf(m, "%4lu, %5d %-16s ",
+ 		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+			seq_printf(m, "%4luD, %5d %-16s ",
 				entry->count, entry->pid, entry->comm);
+		} else {
+			seq_printf(m, " %4lu, %5d %-16s ",
+				entry->count, entry->pid, entry->comm);
+		}
 
 		print_name_offset(m, (unsigned long)entry->start_func);
 		seq_puts(m, " (");
diff --git a/kernel/timer.c b/kernel/timer.c
index 1ab3106a2b5d..1258371e0d2b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -305,6 +305,20 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 	memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
 	timer->start_pid = current->pid;
 }
+
+static void timer_stats_account_timer(struct timer_list *timer)
+{
+	unsigned int flag = 0;
+
+	if (unlikely(tbase_get_deferrable(timer->base)))
+		flag |= TIMER_STATS_FLAG_DEFERRABLE;
+
+	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+				 timer->function, timer->start_comm, flag);
+}
+
+#else
+static void timer_stats_account_timer(struct timer_list *timer) {}
 #endif
 
 /**
-- 
cgit v1.3-8-gc7d7


From 4a19542e5f694cd408a32c3d9dc593ba9366e2d7 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Sun, 15 Jul 2007 23:40:34 -0700
Subject: O_CLOEXEC for SCM_RIGHTS

Part two in the O_CLOEXEC saga: adding support for file descriptors received
through Unix domain sockets.

The patch is once again pretty minimal, it introduces a new flag for recvmsg
and passes it just like the existing MSG_CMSG_COMPAT flag.  I think this bit
is not used otherwise but the networking people will know better.

This new flag is not recognized by recvfrom and recv.  These functions cannot
be used for that purpose and the asymmetry this introduces is not worse than
the already existing MSG_CMSG_COMPAT situations.

The patch must be applied on the patch which introduced O_CLOEXEC.  It has to
remove static from the new get_unused_fd_flags function but since scm.c cannot
live in a module the function still hasn't to be exported.

Here's a test program to make sure the code works.  It's so much longer than
the actual patch...

#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>

#ifndef O_CLOEXEC
# define O_CLOEXEC 02000000
#endif
#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC 0x40000000
#endif

int
main (int argc, char *argv[])
{
  if (argc > 1)
    {
      int fd = atol (argv[1]);
      printf ("child: fd = %d\n", fd);
      if (fcntl (fd, F_GETFD) == 0 || errno != EBADF)
        {
          puts ("file descriptor valid in child");
          return 1;
        }
      return 0;

    }

  struct sockaddr_un sun;
  strcpy (sun.sun_path, "./testsocket");
  sun.sun_family = AF_UNIX;

  char databuf[] = "hello";
  struct iovec iov[1];
  iov[0].iov_base = databuf;
  iov[0].iov_len = sizeof (databuf);

  union
  {
    struct cmsghdr hdr;
    char bytes[CMSG_SPACE (sizeof (int))];
  } buf;
  struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1,
                        .msg_control = buf.bytes,
                        .msg_controllen = sizeof (buf) };
  struct cmsghdr *cmsg = CMSG_FIRSTHDR (&msg);

  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;
  cmsg->cmsg_len = CMSG_LEN (sizeof (int));

  msg.msg_controllen = cmsg->cmsg_len;

  pid_t child = fork ();
  if (child == -1)
    error (1, errno, "fork");
  if (child == 0)
    {
      int sock = socket (PF_UNIX, SOCK_STREAM, 0);
      if (sock < 0)
        error (1, errno, "socket");

      if (bind (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0)
        error (1, errno, "bind");
      if (listen (sock, SOMAXCONN) < 0)
        error (1, errno, "listen");

      int conn = accept (sock, NULL, NULL);
      if (conn == -1)
        error (1, errno, "accept");

      *(int *) CMSG_DATA (cmsg) = sock;
      if (sendmsg (conn, &msg, MSG_NOSIGNAL) < 0)
        error (1, errno, "sendmsg");

      return 0;
    }

  /* For a test suite this should be more robust like a
     barrier in shared memory.  */
  sleep (1);

  int sock = socket (PF_UNIX, SOCK_STREAM, 0);
  if (sock < 0)
    error (1, errno, "socket");

  if (connect (sock, (struct sockaddr *) &sun, sizeof (sun)) < 0)
    error (1, errno, "connect");
  unlink (sun.sun_path);

  *(int *) CMSG_DATA (cmsg) = -1;

  if (recvmsg (sock, &msg, MSG_CMSG_CLOEXEC) < 0)
    error (1, errno, "recvmsg");

  int fd = *(int *) CMSG_DATA (cmsg);
  if (fd == -1)
    error (1, 0, "no descriptor received");

  char fdname[20];
  snprintf (fdname, sizeof (fdname), "%d", fd);
  execl ("/proc/self/exe", argv[0], fdname, NULL);
  puts ("execl failed");
  return 1;
}

[akpm@linux-foundation.org: Fix fastcall inconsistency noted by Michael Buesch]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Michael Buesch <mb@bu3sch.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/open.c              | 2 +-
 include/linux/file.h   | 1 +
 include/linux/socket.h | 3 +++
 net/compat.c           | 3 ++-
 net/core/scm.c         | 3 ++-
 net/socket.c           | 4 +---
 6 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index e6991c1b5874..be6a457f4226 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -855,7 +855,7 @@ EXPORT_SYMBOL(dentry_open);
 /*
  * Find an empty file descriptor entry, and mark it busy.
  */
-static int get_unused_fd_flags(int flags)
+int get_unused_fd_flags(int flags)
 {
 	struct files_struct * files = current->files;
 	int fd, error;
diff --git a/include/linux/file.h b/include/linux/file.h
index a59001e9ea58..0114fbc78061 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -73,6 +73,7 @@ extern struct file * FASTCALL(fget_light(unsigned int fd, int *fput_needed));
 extern void FASTCALL(set_close_on_exec(unsigned int fd, int flag));
 extern void put_filp(struct file *);
 extern int get_unused_fd(void);
+extern int get_unused_fd_flags(int flags);
 extern void FASTCALL(put_unused_fd(unsigned int fd));
 struct kmem_cache;
 
diff --git a/include/linux/socket.h b/include/linux/socket.h
index fe195c97a89d..f852e1afd65a 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -253,6 +253,9 @@ struct ucred {
 
 #define MSG_EOF         MSG_FIN
 
+#define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exit for file
+					   descriptor received through
+					   SCM_RIGHTS */
 #if defined(CONFIG_COMPAT)
 #define MSG_CMSG_COMPAT	0x80000000	/* This message needs 32 bit fixups */
 #else
diff --git a/net/compat.c b/net/compat.c
index 9a0f5f2b90c8..d74d82155d78 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -276,7 +276,8 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
 		err = security_file_receive(fp[i]);
 		if (err)
 			break;
-		err = get_unused_fd();
+		err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags
+					  ? O_CLOEXEC : 0);
 		if (err < 0)
 			break;
 		new_fd = err;
diff --git a/net/core/scm.c b/net/core/scm.c
index 292ad8d5ad76..44c4ec2c8769 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -228,7 +228,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 		err = security_file_receive(fp[i]);
 		if (err)
 			break;
-		err = get_unused_fd();
+		err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
+					  ? O_CLOEXEC : 0);
 		if (err < 0)
 			break;
 		new_fd = err;
diff --git a/net/socket.c b/net/socket.c
index f4530196a70a..b71114250046 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1939,9 +1939,7 @@ asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg,
 	total_len = err;
 
 	cmsg_ptr = (unsigned long)msg_sys.msg_control;
-	msg_sys.msg_flags = 0;
-	if (MSG_CMSG_COMPAT & flags)
-		msg_sys.msg_flags = MSG_CMSG_COMPAT;
+	msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-- 
cgit v1.3-8-gc7d7


From aa0ac36518be648dda3a32f0b37a8b2b546e1b24 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 15 Jul 2007 23:40:39 -0700
Subject: Remove capability.h from mm.h

I forgot to remove capability.h from mm.h while removing sched.h!  This
patch remedies that, because the only inline function which was using
CAP_something was made out of line.

Cross-compile tested without regressions on:

	all powerpc defconfigs
	all mips defconfigs
	all m68k defconfigs
	all arm defconfigs
	all ia64 defconfigs

	alpha alpha-allnoconfig alpha-defconfig alpha-up
	arm
	i386 i386-allnoconfig i386-defconfig i386-up
	ia64 ia64-allnoconfig ia64-defconfig ia64-up
	m68k
	mips
	parisc parisc-allnoconfig parisc-defconfig parisc-up
	powerpc powerpc-up
	s390 s390-allnoconfig s390-defconfig s390-up
	sparc sparc-allnoconfig sparc-defconfig sparc-up
	sparc64 sparc64-allnoconfig sparc64-defconfig sparc64-up
	um-x86_64
	x86_64 x86_64-allnoconfig x86_64-defconfig x86_64-up

as well as my two usual configs.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pci/pci-sysfs.c | 2 +-
 drivers/pci/proc.c      | 2 +-
 fs/gfs2/eaops.c         | 1 +
 include/linux/mm.h      | 1 -
 kernel/time/ntp.c       | 2 +-
 5 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 10dbdec80416..1b7b2812bf2d 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -20,7 +20,7 @@
 #include <linux/stat.h>
 #include <linux/topology.h>
 #include <linux/mm.h>
-
+#include <linux/capability.h>
 #include "pci.h"
 
 static int sysfs_initialized;	/* = 0 */
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index cfa0dfe61b1a..90adc62d07ff 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -11,7 +11,7 @@
 #include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-
+#include <linux/capability.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
 #include "pci.h"
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index c1f44009853f..1ab3e9d73886 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -11,6 +11,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
+#include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d4ab4e590e23..97d0cddfd223 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2,7 +2,6 @@
 #define _LINUX_MM_H
 
 #include <linux/errno.h>
-#include <linux/capability.h>
 
 #ifdef __KERNEL__
 
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index cf53bb5814cb..438c6b723ee2 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -13,7 +13,7 @@
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/hrtimer.h>
-
+#include <linux/capability.h>
 #include <asm/div64.h>
 #include <asm/timex.h>
 
-- 
cgit v1.3-8-gc7d7


From 759448f459234bfcf34b82471f0dba77a9aca498 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@linux01.gwdg.de>
Date: Sun, 15 Jul 2007 23:40:40 -0700
Subject: Kernel utf-8 handling

This patch fixes dead keys and copy/paste of non-ASCII characters in UTF-8
mode on Linux console.  See more details about the original patch at:
http://chris.heathens.co.nz/linux/utf8.html

Already posted on
	(Oldest) http://lkml.org/lkml/2003/5/31/148
	         http://lkml.org/lkml/2005/12/24/69
	(Recent) http://lkml.org/lkml/2006/8/7/75

[bunk@stusta.de: make drivers/char/selection.c:store_utf8() static]
Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Cc: Alexander E. Patrakov <patrakov@ums.usu.ru>
Cc: Dmitry Torokhov <dtor@mail.ru>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/consolemap.c  | 78 ++++++++++++++++++++++++++++++++++++++++++----
 drivers/char/keyboard.c    | 26 +++++++++++-----
 drivers/char/selection.c   | 48 +++++++++++++++++++++++-----
 include/linux/consolemap.h |  5 ++-
 4 files changed, 134 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/consolemap.c b/drivers/char/consolemap.c
index fd40b959afdd..4b3916f54909 100644
--- a/drivers/char/consolemap.c
+++ b/drivers/char/consolemap.c
@@ -177,6 +177,7 @@ struct uni_pagedir {
 	unsigned long	refcount;
 	unsigned long	sum;
 	unsigned char	*inverse_translations[4];
+	u16		*inverse_trans_unicode;
 	int		readonly;
 };
 
@@ -207,6 +208,41 @@ static void set_inverse_transl(struct vc_data *conp, struct uni_pagedir *p, int
 	}
 }
 
+static void set_inverse_trans_unicode(struct vc_data *conp,
+				      struct uni_pagedir *p)
+{
+	int i, j, k, glyph;
+	u16 **p1, *p2;
+	u16 *q;
+
+	if (!p) return;
+	q = p->inverse_trans_unicode;
+	if (!q) {
+		q = p->inverse_trans_unicode =
+			kmalloc(MAX_GLYPH * sizeof(u16), GFP_KERNEL);
+		if (!q)
+			return;
+	}
+	memset(q, 0, MAX_GLYPH * sizeof(u16));
+
+	for (i = 0; i < 32; i++) {
+		p1 = p->uni_pgdir[i];
+		if (!p1)
+			continue;
+		for (j = 0; j < 32; j++) {
+			p2 = p1[j];
+			if (!p2)
+				continue;
+			for (k = 0; k < 64; k++) {
+				glyph = p2[k];
+				if (glyph >= 0 && glyph < MAX_GLYPH
+					       && q[glyph] < 32)
+		  			q[glyph] = (i << 11) + (j << 6) + k;
+			}
+		}
+	}
+}
+
 unsigned short *set_translate(int m, struct vc_data *vc)
 {
 	inv_translate[vc->vc_num] = m;
@@ -217,19 +253,29 @@ unsigned short *set_translate(int m, struct vc_data *vc)
  * Inverse translation is impossible for several reasons:
  * 1. The font<->character maps are not 1-1.
  * 2. The text may have been written while a different translation map
- *    was active, or using Unicode.
+ *    was active.
  * Still, it is now possible to a certain extent to cut and paste non-ASCII.
  */
-unsigned char inverse_translate(struct vc_data *conp, int glyph)
+u16 inverse_translate(struct vc_data *conp, int glyph, int use_unicode)
 {
 	struct uni_pagedir *p;
+	int m;
 	if (glyph < 0 || glyph >= MAX_GLYPH)
 		return 0;
-	else if (!(p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc) ||
-		 !p->inverse_translations[inv_translate[conp->vc_num]])
+	else if (!(p = (struct uni_pagedir *)*conp->vc_uni_pagedir_loc))
 		return glyph;
-	else
-		return p->inverse_translations[inv_translate[conp->vc_num]][glyph];
+	else if (use_unicode) {
+		if (!p->inverse_trans_unicode)
+			return glyph;
+		else
+			return p->inverse_trans_unicode[glyph];
+	} else {
+		m = inv_translate[conp->vc_num];
+		if (!p->inverse_translations[m])
+			return glyph;
+		else
+			return p->inverse_translations[m][glyph];
+	}
 }
 
 static void update_user_maps(void)
@@ -243,6 +289,7 @@ static void update_user_maps(void)
 		p = (struct uni_pagedir *)*vc_cons[i].d->vc_uni_pagedir_loc;
 		if (p && p != q) {
 			set_inverse_transl(vc_cons[i].d, p, USER_MAP);
+			set_inverse_trans_unicode(vc_cons[i].d, p);
 			q = p;
 		}
 	}
@@ -353,6 +400,10 @@ static void con_release_unimap(struct uni_pagedir *p)
 		kfree(p->inverse_translations[i]);
 		p->inverse_translations[i] = NULL;
 	}
+	if (p->inverse_trans_unicode) {
+		kfree(p->inverse_trans_unicode);
+		p->inverse_trans_unicode = NULL;
+	}
 }
 
 void con_free_unimap(struct vc_data *vc)
@@ -511,6 +562,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 
 	for (i = 0; i <= 3; i++)
 		set_inverse_transl(vc, p, i); /* Update all inverse translations */
+	set_inverse_trans_unicode(vc, p);
   
 	return err;
 }
@@ -561,6 +613,7 @@ int con_set_default_unimap(struct vc_data *vc)
 
 	for (i = 0; i <= 3; i++)
 		set_inverse_transl(vc, p, i);	/* Update all inverse translations */
+	set_inverse_trans_unicode(vc, p);
 	dflt = p;
 	return err;
 }
@@ -617,6 +670,19 @@ void con_protect_unimap(struct vc_data *vc, int rdonly)
 		p->readonly = rdonly;
 }
 
+/* may be called during an interrupt */
+u32 conv_8bit_to_uni(unsigned char c)
+{
+	/*
+	 * Always use USER_MAP. This function is used by the keyboard,
+	 * which shouldn't be affected by G0/G1 switching, etc.
+	 * If the user map still contains default values, i.e. the
+	 * direct-to-font mapping, then assume user is using Latin1.
+	 */
+	unsigned short uni = translations[USER_MAP][c];
+	return uni == (0xf000 | c) ? c : uni;
+}
+
 int
 conv_uni_to_pc(struct vc_data *conp, long ucs) 
 {
diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index 90965b4def5c..2ce0af1bd588 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -24,6 +24,7 @@
  * 21-08-02: Converted to input API, major cleanup. (Vojtech Pavlik)
  */
 
+#include <linux/consolemap.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/tty.h>
@@ -308,10 +309,9 @@ static void applkey(struct vc_data *vc, int key, char mode)
  * Many other routines do put_queue, but I think either
  * they produce ASCII, or they produce some user-assigned
  * string, and in both cases we might assume that it is
- * in utf-8 already. UTF-8 is defined for words of up to 31 bits,
- * but we need only 16 bits here
+ * in utf-8 already.
  */
-static void to_utf8(struct vc_data *vc, ushort c)
+static void to_utf8(struct vc_data *vc, uint c)
 {
 	if (c < 0x80)
 		/*  0******* */
@@ -320,11 +320,21 @@ static void to_utf8(struct vc_data *vc, ushort c)
 		/* 110***** 10****** */
 		put_queue(vc, 0xc0 | (c >> 6));
 		put_queue(vc, 0x80 | (c & 0x3f));
-	} else {
+    	} else if (c < 0x10000) {
+	       	if (c >= 0xD800 && c < 0xE000)
+			return;
+		if (c == 0xFFFF)
+			return;
 		/* 1110**** 10****** 10****** */
 		put_queue(vc, 0xe0 | (c >> 12));
 		put_queue(vc, 0x80 | ((c >> 6) & 0x3f));
 		put_queue(vc, 0x80 | (c & 0x3f));
+    	} else if (c < 0x110000) {
+		/* 11110*** 10****** 10****** 10****** */
+		put_queue(vc, 0xf0 | (c >> 18));
+		put_queue(vc, 0x80 | ((c >> 12) & 0x3f));
+		put_queue(vc, 0x80 | ((c >> 6) & 0x3f));
+		put_queue(vc, 0x80 | (c & 0x3f));
 	}
 }
 
@@ -393,7 +403,7 @@ static unsigned int handle_diacr(struct vc_data *vc, unsigned int ch)
 		return d;
 
 	if (kbd->kbdmode == VC_UNICODE)
-		to_utf8(vc, d);
+		to_utf8(vc, conv_8bit_to_uni(d));
 	else if (d < 0x100)
 		put_queue(vc, d);
 
@@ -407,7 +417,7 @@ static void fn_enter(struct vc_data *vc)
 {
 	if (diacr) {
 		if (kbd->kbdmode == VC_UNICODE)
-			to_utf8(vc, diacr);
+			to_utf8(vc, conv_8bit_to_uni(diacr));
 		else if (diacr < 0x100)
 			put_queue(vc, diacr);
 		diacr = 0;
@@ -617,7 +627,7 @@ static void k_unicode(struct vc_data *vc, unsigned int value, char up_flag)
 		return;
 	}
 	if (kbd->kbdmode == VC_UNICODE)
-		to_utf8(vc, value);
+		to_utf8(vc, conv_8bit_to_uni(value));
 	else if (value < 0x100)
 		put_queue(vc, value);
 }
@@ -775,7 +785,7 @@ static void k_shift(struct vc_data *vc, unsigned char value, char up_flag)
 	/* kludge */
 	if (up_flag && shift_state != old_state && npadch != -1) {
 		if (kbd->kbdmode == VC_UNICODE)
-			to_utf8(vc, npadch & 0xffff);
+			to_utf8(vc, npadch);
 		else
 			put_queue(vc, npadch & 0xff);
 		npadch = -1;
diff --git a/drivers/char/selection.c b/drivers/char/selection.c
index a69f094d1ed3..d63f5ccc29e6 100644
--- a/drivers/char/selection.c
+++ b/drivers/char/selection.c
@@ -20,6 +20,7 @@
 
 #include <asm/uaccess.h>
 
+#include <linux/kbd_kern.h>
 #include <linux/vt_kern.h>
 #include <linux/consolemap.h>
 #include <linux/selection.h>
@@ -34,6 +35,7 @@ extern void poke_blanked_console(void);
 /* Variables for selection control. */
 /* Use a dynamic buffer, instead of static (Dec 1994) */
 struct vc_data *sel_cons;		/* must not be deallocated */
+static int use_unicode;
 static volatile int sel_start = -1; 	/* cleared by clear_selection */
 static int sel_end;
 static int sel_buffer_lth;
@@ -54,10 +56,11 @@ static inline void highlight_pointer(const int where)
 	complement_pos(sel_cons, where);
 }
 
-static unsigned char
+static u16
 sel_pos(int n)
 {
-	return inverse_translate(sel_cons, screen_glyph(sel_cons, n));
+	return inverse_translate(sel_cons, screen_glyph(sel_cons, n),
+				use_unicode);
 }
 
 /* remove the current selection highlight, if any,
@@ -86,8 +89,8 @@ static u32 inwordLut[8]={
   0xFF7FFFFF  /* latin-1 accented letters, not division sign */
 };
 
-static inline int inword(const unsigned char c) {
-	return ( inwordLut[c>>5] >> (c & 0x1F) ) & 1;
+static inline int inword(const u16 c) {
+	return c > 0xff || (( inwordLut[c>>5] >> (c & 0x1F) ) & 1);
 }
 
 /* set inwordLut contents. Invoked by ioctl(). */
@@ -108,13 +111,36 @@ static inline unsigned short limit(const unsigned short v, const unsigned short
 	return (v > u) ? u : v;
 }
 
+/* stores the char in UTF8 and returns the number of bytes used (1-3) */
+static int store_utf8(u16 c, char *p)
+{
+	if (c < 0x80) {
+		/*  0******* */
+		p[0] = c;
+		return 1;
+	} else if (c < 0x800) {
+		/* 110***** 10****** */
+		p[0] = 0xc0 | (c >> 6);
+		p[1] = 0x80 | (c & 0x3f);
+		return 2;
+    	} else {
+		/* 1110**** 10****** 10****** */
+		p[0] = 0xe0 | (c >> 12);
+		p[1] = 0x80 | ((c >> 6) & 0x3f);
+		p[2] = 0x80 | (c & 0x3f);
+		return 3;
+    	}
+}
+
 /* set the current selection. Invoked by ioctl() or by kernel code. */
 int set_selection(const struct tiocl_selection __user *sel, struct tty_struct *tty)
 {
 	struct vc_data *vc = vc_cons[fg_console].d;
 	int sel_mode, new_sel_start, new_sel_end, spc;
 	char *bp, *obp;
-	int i, ps, pe;
+	int i, ps, pe, multiplier;
+	u16 c;
+	struct kbd_struct *kbd = kbd_table + fg_console;
 
 	poke_blanked_console();
 
@@ -158,6 +184,7 @@ int set_selection(const struct tiocl_selection __user *sel, struct tty_struct *t
 		clear_selection();
 		sel_cons = vc_cons[fg_console].d;
 	}
+	use_unicode = kbd && kbd->kbdmode == VC_UNICODE;
 
 	switch (sel_mode)
 	{
@@ -240,7 +267,8 @@ int set_selection(const struct tiocl_selection __user *sel, struct tty_struct *t
 	sel_end = new_sel_end;
 
 	/* Allocate a new buffer before freeing the old one ... */
-	bp = kmalloc((sel_end-sel_start)/2+1, GFP_KERNEL);
+	multiplier = use_unicode ? 3 : 1;  /* chars can take up to 3 bytes */
+	bp = kmalloc((sel_end-sel_start)/2*multiplier+1, GFP_KERNEL);
 	if (!bp) {
 		printk(KERN_WARNING "selection: kmalloc() failed\n");
 		clear_selection();
@@ -251,8 +279,12 @@ int set_selection(const struct tiocl_selection __user *sel, struct tty_struct *t
 
 	obp = bp;
 	for (i = sel_start; i <= sel_end; i += 2) {
-		*bp = sel_pos(i);
-		if (!isspace(*bp++))
+		c = sel_pos(i);
+		if (use_unicode)
+			bp += store_utf8(c, bp);
+		else
+			*bp++ = c;
+		if (!isspace(c))
 			obp = bp;
 		if (! ((i + 2) % vc->vc_size_row)) {
 			/* strip trailing blanks from line and add newline,
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index 82c9a1f11020..06b2768c603f 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -8,9 +8,12 @@
 #define IBMPC_MAP 2
 #define USER_MAP 3
 
+#include <linux/types.h>
+
 struct vc_data;
 
-extern unsigned char inverse_translate(struct vc_data *conp, int glyph);
+extern u16 inverse_translate(struct vc_data *conp, int glyph, int use_unicode);
 extern unsigned short *set_translate(int m, struct vc_data *vc);
 extern int conv_uni_to_pc(struct vc_data *conp, long ucs);
+extern u32 conv_8bit_to_uni(unsigned char c);
 void console_map_init(void);
-- 
cgit v1.3-8-gc7d7


From c289dca37917338fc8ab2e0d7e202a1c927e229e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 15 Jul 2007 23:40:42 -0700
Subject: remove sonypi_camera_command()

Remove the no longer used sonypi_camera_command().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Mattia Dongili <malattia@linux.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/sonypi.c  | 47 -----------------------------------------------
 include/linux/sonypi.h |  2 --
 2 files changed, 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c
index 3ef593a9015f..73037a4d3c50 100644
--- a/drivers/char/sonypi.c
+++ b/drivers/char/sonypi.c
@@ -885,53 +885,6 @@ found:
 	return IRQ_HANDLED;
 }
 
-/* External camera command (exported to the motion eye v4l driver) */
-int sonypi_camera_command(int command, u8 value)
-{
-	if (!camera)
-		return -EIO;
-
-	mutex_lock(&sonypi_device.lock);
-
-	switch (command) {
-	case SONYPI_COMMAND_SETCAMERA:
-		if (value)
-			sonypi_camera_on();
-		else
-			sonypi_camera_off();
-		break;
-	case SONYPI_COMMAND_SETCAMERABRIGHTNESS:
-		sonypi_set(SONYPI_CAMERA_BRIGHTNESS, value);
-		break;
-	case SONYPI_COMMAND_SETCAMERACONTRAST:
-		sonypi_set(SONYPI_CAMERA_CONTRAST, value);
-		break;
-	case SONYPI_COMMAND_SETCAMERAHUE:
-		sonypi_set(SONYPI_CAMERA_HUE, value);
-		break;
-	case SONYPI_COMMAND_SETCAMERACOLOR:
-		sonypi_set(SONYPI_CAMERA_COLOR, value);
-		break;
-	case SONYPI_COMMAND_SETCAMERASHARPNESS:
-		sonypi_set(SONYPI_CAMERA_SHARPNESS, value);
-		break;
-	case SONYPI_COMMAND_SETCAMERAPICTURE:
-		sonypi_set(SONYPI_CAMERA_PICTURE, value);
-		break;
-	case SONYPI_COMMAND_SETCAMERAAGC:
-		sonypi_set(SONYPI_CAMERA_AGC, value);
-		break;
-	default:
-		printk(KERN_ERR "sonypi: sonypi_camera_command invalid: %d\n",
-		       command);
-		break;
-	}
-	mutex_unlock(&sonypi_device.lock);
-	return 0;
-}
-
-EXPORT_SYMBOL(sonypi_camera_command);
-
 static int sonypi_misc_fasync(int fd, struct file *filp, int on)
 {
 	int retval;
diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h
index 34d4b075f7b8..40c7b5d993b9 100644
--- a/include/linux/sonypi.h
+++ b/include/linux/sonypi.h
@@ -153,8 +153,6 @@
 #define SONYPI_COMMAND_GETCAMERAROMVERSION	18	/* obsolete */
 #define SONYPI_COMMAND_GETCAMERAREVISION	19	/* obsolete */
 
-int sonypi_camera_command(int command, u8 value);
-
 #endif				/* __KERNEL__ */
 
 #endif				/* _SONYPI_H_ */
-- 
cgit v1.3-8-gc7d7


From dcae56ea661e13d8f904b584bbe4c1e50c7ee548 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Sun, 15 Jul 2007 23:40:45 -0700
Subject: Drop an empty isicom.h from being exported to user space.

Drop <linux/isicom.h> from being exported to user space since it would
be only an empty file.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Acked-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/Kbuild | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 127d2d192b5a..bcf875e844fe 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -247,7 +247,6 @@ unifdef-y += isdn.h
 unifdef-y += isdnif.h
 unifdef-y += isdn_divertif.h
 unifdef-y += isdn_ppp.h
-unifdef-y += isicom.h
 unifdef-y += jbd.h
 unifdef-y += joystick.h
 unifdef-y += kdev_t.h
-- 
cgit v1.3-8-gc7d7


From b663a79c191508f27cd885224b592a878c0ba0f6 Mon Sep 17 00:00:00 2001
From: Maxim Uvarov <muvarov@ru.mvista.com>
Date: Sun, 15 Jul 2007 23:40:48 -0700
Subject: taskstats: add context-switch counters

Make available to the user the following task and process performance
statistics:

	* Involuntary Context Switches (task_struct->nivcsw)
	* Voluntary Context Switches (task_struct->nvcsw)

Statistics information is available from:
	1. taskstats interface (Documentation/accounting/)
	2. /proc/PID/status (task only).

This data is useful for detecting hyperactivity patterns between processes.

[akpm@linux-foundation.org: cleanup]
Signed-off-by: Maxim Uvarov <muvarov@ru.mvista.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Jonathan Lim <jlim@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/accounting/getdelays.c          | 19 +++++++++++++++++--
 Documentation/accounting/taskstats-struct.txt |  6 ++++++
 fs/proc/array.c                               | 10 ++++++++++
 include/linux/taskstats.h                     |  5 ++++-
 kernel/taskstats.c                            |  4 ++++
 5 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index 71acc28ed0d1..24c5aade8998 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -49,6 +49,7 @@ char name[100];
 int dbg;
 int print_delays;
 int print_io_accounting;
+int print_task_context_switch_counts;
 __u64 stime, utime;
 
 #define PRINTF(fmt, arg...) {			\
@@ -195,7 +196,7 @@ void print_delayacct(struct taskstats *t)
 	       "IO    %15s%15s\n"
 	       "      %15llu%15llu\n"
 	       "MEM   %15s%15s\n"
-	       "      %15llu%15llu\n\n",
+	       "      %15llu%15llu\n"
 	       "count", "real total", "virtual total", "delay total",
 	       t->cpu_count, t->cpu_run_real_total, t->cpu_run_virtual_total,
 	       t->cpu_delay_total,
@@ -204,6 +205,14 @@ void print_delayacct(struct taskstats *t)
 	       "count", "delay total", t->swapin_count, t->swapin_delay_total);
 }
 
+void task_context_switch_counts(struct taskstats *t)
+{
+	printf("\n\nTask   %15s%15s\n"
+	       "       %15lu%15lu\n",
+	       "voluntary", "nonvoluntary",
+	       t->nvcsw, t->nivcsw);
+}
+
 void print_ioacct(struct taskstats *t)
 {
 	printf("%s: read=%llu, write=%llu, cancelled_write=%llu\n",
@@ -235,7 +244,7 @@ int main(int argc, char *argv[])
 	struct msgtemplate msg;
 
 	while (1) {
-		c = getopt(argc, argv, "diw:r:m:t:p:vl");
+		c = getopt(argc, argv, "qdiw:r:m:t:p:vl");
 		if (c < 0)
 			break;
 
@@ -248,6 +257,10 @@ int main(int argc, char *argv[])
 			printf("printing IO accounting\n");
 			print_io_accounting = 1;
 			break;
+		case 'q':
+			printf("printing task/process context switch rates\n");
+			print_task_context_switch_counts = 1;
+			break;
 		case 'w':
 			logfile = strdup(optarg);
 			printf("write to file %s\n", logfile);
@@ -389,6 +402,8 @@ int main(int argc, char *argv[])
 							print_delayacct((struct taskstats *) NLA_DATA(na));
 						if (print_io_accounting)
 							print_ioacct((struct taskstats *) NLA_DATA(na));
+						if (print_task_context_switch_counts)
+							task_context_switch_counts((struct taskstats *) NLA_DATA(na));
 						if (fd) {
 							if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
 								err(1,"write error\n");
diff --git a/Documentation/accounting/taskstats-struct.txt b/Documentation/accounting/taskstats-struct.txt
index 661c797eaf79..8aa7529f8258 100644
--- a/Documentation/accounting/taskstats-struct.txt
+++ b/Documentation/accounting/taskstats-struct.txt
@@ -22,6 +22,8 @@ There are three different groups of fields in the struct taskstats:
     /* Extended accounting fields end */
     Their values are collected if CONFIG_TASK_XACCT is set.
 
+4) Per-task and per-thread context switch count statistics
+
 Future extension should add fields to the end of the taskstats struct, and
 should not change the relative position of each field within the struct.
 
@@ -158,4 +160,8 @@ struct taskstats {
 
 	/* Extended accounting fields end */
 
+4) Per-task and per-thread statistics
+	__u64	nvcsw;			/* Context voluntary switch counter */
+	__u64	nivcsw;			/* Context involuntary switch counter */
+
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 680c913575f0..9cbab7e93557 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -289,6 +289,15 @@ static inline char *task_cap(struct task_struct *p, char *buffer)
 			    cap_t(p->cap_effective));
 }
 
+static inline char *task_context_switch_counts(struct task_struct *p,
+						char *buffer)
+{
+	return buffer + sprintf(buffer, "voluntary_ctxt_switches:\t%lu\n"
+			    "nonvoluntary_ctxt_switches:\t%lu\n",
+			    p->nvcsw,
+			    p->nivcsw);
+}
+
 int proc_pid_status(struct task_struct *task, char * buffer)
 {
 	char * orig = buffer;
@@ -307,6 +316,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
 #if defined(CONFIG_S390)
 	buffer = task_show_regs(task, buffer);
 #endif
+	buffer = task_context_switch_counts(task, buffer);
 	return buffer - orig;
 }
 
diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index a46104a28f66..dce1ed204972 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -31,7 +31,7 @@
  */
 
 
-#define TASKSTATS_VERSION	4
+#define TASKSTATS_VERSION	5
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -149,6 +149,9 @@ struct taskstats {
 	__u64	read_bytes;		/* bytes of read I/O */
 	__u64	write_bytes;		/* bytes of write I/O */
 	__u64	cancelled_write_bytes;	/* bytes of cancelled write I/O */
+
+	__u64  nvcsw;			/* voluntary_ctxt_switches */
+	__u64  nivcsw;			/* nonvoluntary_ctxt_switches */
 };
 
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 906cae771585..059431ed67db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -196,6 +196,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
 
 	/* fill in basic acct fields */
 	stats->version = TASKSTATS_VERSION;
+	stats->nvcsw = tsk->nvcsw;
+	stats->nivcsw = tsk->nivcsw;
 	bacct_add_tsk(stats, tsk);
 
 	/* fill in extended acct fields */
@@ -242,6 +244,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
 		 */
 		delayacct_add_tsk(stats, tsk);
 
+		stats->nvcsw += tsk->nvcsw;
+		stats->nivcsw += tsk->nivcsw;
 	} while_each_thread(first, tsk);
 
 	unlock_task_sighand(first, &flags);
-- 
cgit v1.3-8-gc7d7


From 5216184571946b8bbf06f0cd630c7754190fdd1a Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Sun, 15 Jul 2007 23:40:51 -0700
Subject: fix typo in prefetch.h

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/prefetch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h
index fc86f274147f..1adfe668d031 100644
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -27,7 +27,7 @@
 	
 	prefetch(x)  	- prefetches the cacheline at "x" for read
 	prefetchw(x)	- prefetches the cacheline at "x" for write
-	spin_lock_prefetch(x) - prefectches the spinlock *x for taking
+	spin_lock_prefetch(x) - prefetches the spinlock *x for taking
 	
 	there is also PREFETCH_STRIDE which is the architecure-prefered 
 	"lookahead" size for prefetching streamed operations.
-- 
cgit v1.3-8-gc7d7


From 4f27c00bf80f122513d3a5be16ed851573164534 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Sun, 15 Jul 2007 23:40:55 -0700
Subject: Improve behaviour of spurious IRQ detect

Currently we handle spurious IRQ activity based upon seeing a lot of
invalid interrupts, and we clear things back on the base of lots of valid
interrupts.

Unfortunately in some cases you get legitimate invalid interrupts caused by
timing asynchronicity between the PCI bus and the APIC bus when disabling
interrupts and pulling other tricks.  In this case although the spurious
IRQs are not a problem our unhandled counters didn't clear and they act as
a slow running timebomb.  (This is effectively what the serial port/tty
problem that was fixed by clearing counters when registering a handler
showed up)

It's easy enough to add a second parameter - time.  This means that if we
see a regular stream of harmless spurious interrupts which are not harming
processing we don't go off and do something stupid like disable the IRQ
after a month of running.  OTOH lockups and performance killers show up a
lot more than 10/second

[akpm@linux-foundation.org: cleanup]
Signed-off-by: Alan Cox <alan@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/irq.h   |  1 +
 kernel/irq/spurious.c | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1695054e8c63..44657197fcb0 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -161,6 +161,7 @@ struct irq_desc {
 	unsigned int		wake_depth;	/* nested wake enables */
 	unsigned int		irq_count;	/* For detecting broken IRQs */
 	unsigned int		irqs_unhandled;
+	unsigned long		last_unhandled;	/* Aging timer for unhandled count */
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
 	cpumask_t		affinity;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd9e272d55e9..32b161972fad 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -172,7 +172,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		    irqreturn_t action_ret)
 {
 	if (unlikely(action_ret != IRQ_HANDLED)) {
-		desc->irqs_unhandled++;
+		/*
+		 * If we are seeing only the odd spurious IRQ caused by
+		 * bus asynchronicity then don't eventually trigger an error,
+		 * otherwise the couter becomes a doomsday timer for otherwise
+		 * working systems
+		 */
+		if (jiffies - desc->last_unhandled > HZ/10)
+			desc->irqs_unhandled = 1;
+		else
+			desc->irqs_unhandled++;
+		desc->last_unhandled = jiffies;
 		if (unlikely(action_ret != IRQ_NONE))
 			report_bad_irq(irq, desc, action_ret);
 	}
-- 
cgit v1.3-8-gc7d7


From 522ed7767e800cff6c650ec64b0ee0677303119c Mon Sep 17 00:00:00 2001
From: Miloslav Trmac <mitr@redhat.com>
Date: Sun, 15 Jul 2007 23:40:56 -0700
Subject: Audit: add TTY input auditing

Add TTY input auditing, used to audit system administrator's actions.  This is
required by various security standards such as DCID 6/3 and PCI to provide
non-repudiation of administrator's actions and to allow a review of past
actions if the administrator seems to overstep their duties or if the system
becomes misconfigured for unknown reasons.  These requirements do not make it
necessary to audit TTY output as well.

Compared to an user-space keylogger, this approach records TTY input using the
audit subsystem, correlated with other audit events, and it is completely
transparent to the user-space application (e.g.  the console ioctls still
work).

TTY input auditing works on a higher level than auditing all system calls
within the session, which would produce an overwhelming amount of mostly
useless audit events.

Add an "audit_tty" attribute, inherited across fork ().  Data read from TTYs
by process with the attribute is sent to the audit subsystem by the kernel.
The audit netlink interface is extended to allow modifying the audit_tty
attribute, and to allow sending explanatory audit events from user-space (for
example, a shell might send an event containing the final command, after the
interactive command-line editing and history expansion is performed, which
might be difficult to decipher from the TTY input alone).

Because the "audit_tty" attribute is inherited across fork (), it would be set
e.g.  for sshd restarted within an audited session.  To prevent this, the
audit_tty attribute is cleared when a process with no open TTY file
descriptors (e.g.  after daemon startup) opens a TTY.

See https://www.redhat.com/archives/linux-audit/2007-June/msg00000.html for a
more detailed rationale document for an older version of this patch.

[akpm@linux-foundation.org: build fix]
Signed-off-by: Miloslav Trmac <mitr@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Paul Fulghum <paulkf@microgate.com>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/Makefile        |   1 +
 drivers/char/n_tty.c         |  20 ++-
 drivers/char/tty_audit.c     | 345 +++++++++++++++++++++++++++++++++++++++++++
 drivers/char/tty_io.c        |  14 +-
 include/linux/audit.h        |  11 ++
 include/linux/sched.h        |   4 +
 include/linux/tty.h          |  33 +++++
 kernel/audit.c               |  96 +++++++++++-
 kernel/audit.h               |   1 -
 kernel/auditsc.c             |   3 -
 kernel/exit.c                |   2 +
 kernel/fork.c                |   3 +
 net/netlabel/netlabel_user.c |   4 -
 security/selinux/nlmsgtab.c  |   2 +
 14 files changed, 518 insertions(+), 21 deletions(-)
 create mode 100644 drivers/char/tty_audit.c

(limited to 'include/linux')

diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 2f56ecc035aa..f2996a95eb07 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -15,6 +15,7 @@ obj-y				+= misc.o
 obj-$(CONFIG_VT)		+= vt_ioctl.o vc_screen.o consolemap.o \
 				   consolemap_deftbl.o selection.o keyboard.o
 obj-$(CONFIG_HW_CONSOLE)	+= vt.o defkeymap.o
+obj-$(CONFIG_AUDIT)		+= tty_audit.o
 obj-$(CONFIG_MAGIC_SYSRQ)	+= sysrq.o
 obj-$(CONFIG_ESPSERIAL)		+= esp.o
 obj-$(CONFIG_MVME147_SCC)	+= generic_serial.o vme_scc.o
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index 371631f4bfb9..038056911934 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -45,6 +45,8 @@
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/bitops.h>
+#include <linux/audit.h>
+#include <linux/file.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -78,6 +80,13 @@ static inline void free_buf(unsigned char *buf)
 		free_page((unsigned long) buf);
 }
 
+static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
+			       unsigned char __user *ptr)
+{
+	tty_audit_add_data(tty, &x, 1);
+	return put_user(x, ptr);
+}
+
 /**
  *	n_tty_set__room	-	receive space
  *	@tty: terminal
@@ -1153,6 +1162,7 @@ static int copy_from_read_buf(struct tty_struct *tty,
 	if (n) {
 		retval = copy_to_user(*b, &tty->read_buf[tty->read_tail], n);
 		n -= retval;
+		tty_audit_add_data(tty, &tty->read_buf[tty->read_tail], n);
 		spin_lock_irqsave(&tty->read_lock, flags);
 		tty->read_tail = (tty->read_tail + n) & (N_TTY_BUF_SIZE-1);
 		tty->read_cnt -= n;
@@ -1279,7 +1289,7 @@ do_it_again:
 				break;
 			cs = tty->link->ctrl_status;
 			tty->link->ctrl_status = 0;
-			if (put_user(cs, b++)) {
+			if (tty_put_user(tty, cs, b++)) {
 				retval = -EFAULT;
 				b--;
 				break;
@@ -1321,7 +1331,7 @@ do_it_again:
 
 		/* Deal with packet mode. */
 		if (tty->packet && b == buf) {
-			if (put_user(TIOCPKT_DATA, b++)) {
+			if (tty_put_user(tty, TIOCPKT_DATA, b++)) {
 				retval = -EFAULT;
 				b--;
 				break;
@@ -1352,15 +1362,17 @@ do_it_again:
 				spin_unlock_irqrestore(&tty->read_lock, flags);
 
 				if (!eol || (c != __DISABLED_CHAR)) {
-					if (put_user(c, b++)) {
+					if (tty_put_user(tty, c, b++)) {
 						retval = -EFAULT;
 						b--;
 						break;
 					}
 					nr--;
 				}
-				if (eol)
+				if (eol) {
+					tty_audit_push(tty);
 					break;
+				}
 			}
 			if (retval)
 				break;
diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
new file mode 100644
index 000000000000..d222012c1b0c
--- /dev/null
+++ b/drivers/char/tty_audit.c
@@ -0,0 +1,345 @@
+/*
+ * Creating audit events from TTY input.
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.  This copyrighted
+ * material is made available to anyone wishing to use, modify, copy, or
+ * redistribute it subject to the terms and conditions of the GNU General
+ * Public License v.2.
+ *
+ * Authors: Miloslav Trmac <mitr@redhat.com>
+ */
+
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/tty.h>
+
+struct tty_audit_buf {
+	atomic_t count;
+	struct mutex mutex;	/* Protects all data below */
+	int major, minor;	/* The TTY which the data is from */
+	unsigned icanon:1;
+	size_t valid;
+	unsigned char *data;	/* Allocated size N_TTY_BUF_SIZE */
+};
+
+static struct tty_audit_buf *tty_audit_buf_alloc(int major, int minor,
+						 int icanon)
+{
+	struct tty_audit_buf *buf;
+
+	buf = kmalloc(sizeof (*buf), GFP_KERNEL);
+	if (!buf)
+		goto err;
+	if (PAGE_SIZE != N_TTY_BUF_SIZE)
+		buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
+	else
+		buf->data = (unsigned char *)__get_free_page(GFP_KERNEL);
+	if (!buf->data)
+		goto err_buf;
+	atomic_set(&buf->count, 1);
+	mutex_init(&buf->mutex);
+	buf->major = major;
+	buf->minor = minor;
+	buf->icanon = icanon;
+	buf->valid = 0;
+	return buf;
+
+err_buf:
+	kfree(buf);
+err:
+	return NULL;
+}
+
+static void tty_audit_buf_free(struct tty_audit_buf *buf)
+{
+	WARN_ON(buf->valid != 0);
+	if (PAGE_SIZE != N_TTY_BUF_SIZE)
+		kfree(buf->data);
+	else
+		free_page((unsigned long)buf->data);
+	kfree(buf);
+}
+
+static void tty_audit_buf_put(struct tty_audit_buf *buf)
+{
+	if (atomic_dec_and_test(&buf->count))
+		tty_audit_buf_free(buf);
+}
+
+/**
+ *	tty_audit_buf_push	-	Push buffered data out
+ *
+ *	Generate an audit message from the contents of @buf, which is owned by
+ *	@tsk with @loginuid.  @buf->mutex must be locked.
+ */
+static void tty_audit_buf_push(struct task_struct *tsk, uid_t loginuid,
+			       struct tty_audit_buf *buf)
+{
+	struct audit_buffer *ab;
+
+	if (buf->valid == 0)
+		return;
+	if (audit_enabled == 0)
+		return;
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_TTY);
+	if (ab) {
+		char name[sizeof(tsk->comm)];
+
+		audit_log_format(ab, "tty pid=%u uid=%u auid=%u major=%d "
+				 "minor=%d comm=", tsk->pid, tsk->uid,
+				 loginuid, buf->major, buf->minor);
+		get_task_comm(name, tsk);
+		audit_log_untrustedstring(ab, name);
+		audit_log_format(ab, " data=");
+		audit_log_n_untrustedstring(ab, buf->valid, buf->data);
+		audit_log_end(ab);
+	}
+	buf->valid = 0;
+}
+
+/**
+ *	tty_audit_buf_push_current	-	Push buffered data out
+ *
+ *	Generate an audit message from the contents of @buf, which is owned by
+ *	the current task.  @buf->mutex must be locked.
+ */
+static void tty_audit_buf_push_current(struct tty_audit_buf *buf)
+{
+	tty_audit_buf_push(current, audit_get_loginuid(current->audit_context),
+			   buf);
+}
+
+/**
+ *	tty_audit_exit	-	Handle a task exit
+ *
+ *	Make sure all buffered data is written out and deallocate the buffer.
+ *	Only needs to be called if current->signal->tty_audit_buf != %NULL.
+ */
+void tty_audit_exit(void)
+{
+	struct tty_audit_buf *buf;
+
+	spin_lock_irq(&current->sighand->siglock);
+	buf = current->signal->tty_audit_buf;
+	current->signal->tty_audit_buf = NULL;
+	spin_unlock_irq(&current->sighand->siglock);
+	if (!buf)
+		return;
+
+	mutex_lock(&buf->mutex);
+	tty_audit_buf_push_current(buf);
+	mutex_unlock(&buf->mutex);
+
+	tty_audit_buf_put(buf);
+}
+
+/**
+ *	tty_audit_fork	-	Copy TTY audit state for a new task
+ *
+ *	Set up TTY audit state in @sig from current.  @sig needs no locking.
+ */
+void tty_audit_fork(struct signal_struct *sig)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	sig->audit_tty = current->signal->audit_tty;
+	spin_unlock_irq(&current->sighand->siglock);
+	sig->tty_audit_buf = NULL;
+}
+
+/**
+ *	tty_audit_push_task	-	Flush task's pending audit data
+ */
+void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid)
+{
+	struct tty_audit_buf *buf;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	buf = tsk->signal->tty_audit_buf;
+	if (buf)
+		atomic_inc(&buf->count);
+	spin_unlock_irq(&tsk->sighand->siglock);
+	if (!buf)
+		return;
+
+	mutex_lock(&buf->mutex);
+	tty_audit_buf_push(tsk, loginuid, buf);
+	mutex_unlock(&buf->mutex);
+
+	tty_audit_buf_put(buf);
+}
+
+/**
+ *	tty_audit_buf_get	-	Get an audit buffer.
+ *
+ *	Get an audit buffer for @tty, allocate it if necessary.  Return %NULL
+ *	if TTY auditing is disabled or out of memory.  Otherwise, return a new
+ *	reference to the buffer.
+ */
+static struct tty_audit_buf *tty_audit_buf_get(struct tty_struct *tty)
+{
+	struct tty_audit_buf *buf, *buf2;
+
+	buf = NULL;
+	buf2 = NULL;
+	spin_lock_irq(&current->sighand->siglock);
+	if (likely(!current->signal->audit_tty))
+		goto out;
+	buf = current->signal->tty_audit_buf;
+	if (buf) {
+		atomic_inc(&buf->count);
+		goto out;
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+
+	buf2 = tty_audit_buf_alloc(tty->driver->major,
+				   tty->driver->minor_start + tty->index,
+				   tty->icanon);
+	if (buf2 == NULL) {
+		audit_log_lost("out of memory in TTY auditing");
+		return NULL;
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (!current->signal->audit_tty)
+		goto out;
+	buf = current->signal->tty_audit_buf;
+	if (!buf) {
+		current->signal->tty_audit_buf = buf2;
+		buf = buf2;
+		buf2 = NULL;
+	}
+	atomic_inc(&buf->count);
+	/* Fall through */
+ out:
+	spin_unlock_irq(&current->sighand->siglock);
+	if (buf2)
+		tty_audit_buf_free(buf2);
+	return buf;
+}
+
+/**
+ *	tty_audit_add_data	-	Add data for TTY auditing.
+ *
+ *	Audit @data of @size from @tty, if necessary.
+ */
+void tty_audit_add_data(struct tty_struct *tty, unsigned char *data,
+			size_t size)
+{
+	struct tty_audit_buf *buf;
+	int major, minor;
+
+	if (unlikely(size == 0))
+		return;
+
+	buf = tty_audit_buf_get(tty);
+	if (!buf)
+		return;
+
+	mutex_lock(&buf->mutex);
+	major = tty->driver->major;
+	minor = tty->driver->minor_start + tty->index;
+	if (buf->major != major || buf->minor != minor
+	    || buf->icanon != tty->icanon) {
+		tty_audit_buf_push_current(buf);
+		buf->major = major;
+		buf->minor = minor;
+		buf->icanon = tty->icanon;
+	}
+	do {
+		size_t run;
+
+		run = N_TTY_BUF_SIZE - buf->valid;
+		if (run > size)
+			run = size;
+		memcpy(buf->data + buf->valid, data, run);
+		buf->valid += run;
+		data += run;
+		size -= run;
+		if (buf->valid == N_TTY_BUF_SIZE)
+			tty_audit_buf_push_current(buf);
+	} while (size != 0);
+	mutex_unlock(&buf->mutex);
+	tty_audit_buf_put(buf);
+}
+
+/**
+ *	tty_audit_push	-	Push buffered data out
+ *
+ *	Make sure no audit data is pending for @tty on the current process.
+ */
+void tty_audit_push(struct tty_struct *tty)
+{
+	struct tty_audit_buf *buf;
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (likely(!current->signal->audit_tty)) {
+		spin_unlock_irq(&current->sighand->siglock);
+		return;
+	}
+	buf = current->signal->tty_audit_buf;
+	if (buf)
+		atomic_inc(&buf->count);
+	spin_unlock_irq(&current->sighand->siglock);
+
+	if (buf) {
+		int major, minor;
+
+		major = tty->driver->major;
+		minor = tty->driver->minor_start + tty->index;
+		mutex_lock(&buf->mutex);
+		if (buf->major == major && buf->minor == minor)
+			tty_audit_buf_push_current(buf);
+		mutex_unlock(&buf->mutex);
+		tty_audit_buf_put(buf);
+	}
+}
+
+/**
+ *	tty_audit_opening	-	A TTY is being opened.
+ *
+ *	As a special hack, tasks that close all their TTYs and open new ones
+ *	are assumed to be system daemons (e.g. getty) and auditing is
+ *	automatically disabled for them.
+ */
+void tty_audit_opening(void)
+{
+	int disable;
+
+	disable = 1;
+	spin_lock_irq(&current->sighand->siglock);
+	if (current->signal->audit_tty == 0)
+		disable = 0;
+	spin_unlock_irq(&current->sighand->siglock);
+	if (!disable)
+		return;
+
+	task_lock(current);
+	if (current->files) {
+		struct fdtable *fdt;
+		unsigned i;
+
+		/*
+		 * We don't take a ref to the file, so we must hold ->file_lock
+		 * instead.
+		 */
+		spin_lock(&current->files->file_lock);
+		fdt = files_fdtable(current->files);
+		for (i = 0; i < fdt->max_fds; i++) {
+			struct file *filp;
+
+			filp = fcheck_files(current->files, i);
+			if (filp && is_tty(filp)) {
+				disable = 0;
+				break;
+			}
+		}
+		spin_unlock(&current->files->file_lock);
+	}
+	task_unlock(current);
+	if (!disable)
+		return;
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->signal->audit_tty = 0;
+	spin_unlock_irq(&current->sighand->siglock);
+}
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index fde69e589ca7..de37ebc3a4cf 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1503,6 +1503,15 @@ int tty_hung_up_p(struct file * filp)
 
 EXPORT_SYMBOL(tty_hung_up_p);
 
+/**
+ * is_tty	-	checker whether file is a TTY
+ */
+int is_tty(struct file *filp)
+{
+	return filp->f_op->read == tty_read
+		|| filp->f_op->read == hung_up_tty_read;
+}
+
 static void session_clear_tty(struct pid *session)
 {
 	struct task_struct *p;
@@ -2673,6 +2682,7 @@ got_driver:
 		__proc_set_tty(current, tty);
 	spin_unlock_irq(&current->sighand->siglock);
 	mutex_unlock(&tty_mutex);
+	tty_audit_opening();
 	return 0;
 }
 
@@ -2735,8 +2745,10 @@ static int ptmx_open(struct inode * inode, struct file * filp)
 
 	check_tty_count(tty, "tty_open");
 	retval = ptm_driver->open(tty, filp);
-	if (!retval)
+	if (!retval) {
+		tty_audit_opening();
 		return 0;
+	}
 out1:
 	release_dev(filp);
 	return retval;
diff --git a/include/linux/audit.h b/include/linux/audit.h
index fccc6e50298a..8ca7ca0b47f0 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -63,9 +63,12 @@
 #define AUDIT_ADD_RULE		1011	/* Add syscall filtering rule */
 #define AUDIT_DEL_RULE		1012	/* Delete syscall filtering rule */
 #define AUDIT_LIST_RULES	1013	/* List syscall filtering rules */
+#define AUDIT_TTY_GET		1014	/* Get TTY auditing status */
+#define AUDIT_TTY_SET		1015	/* Set TTY auditing status */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
+#define AUDIT_USER_TTY		1124	/* Non-ICANON TTY input meaning */
 #define AUDIT_LAST_USER_MSG	1199
 #define AUDIT_FIRST_USER_MSG2	2100	/* More user space messages */
 #define AUDIT_LAST_USER_MSG2	2999
@@ -92,6 +95,7 @@
 #define AUDIT_KERNEL_OTHER	1316	/* For use by 3rd party modules */
 #define AUDIT_FD_PAIR		1317    /* audit record for pipe/socketpair */
 #define AUDIT_OBJ_PID		1318	/* ptrace target */
+#define AUDIT_TTY		1319	/* Input on an administrative TTY */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
@@ -289,6 +293,10 @@ struct audit_status {
 	__u32		backlog;	/* messages waiting in queue */
 };
 
+struct audit_tty_status {
+	__u32		enabled; /* 1 = enabled, 0 = disabled */
+};
+
 /* audit_rule_data supports filter rules with both integer and string
  * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
  * AUDIT_LIST_RULES requests.
@@ -515,11 +523,13 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,
 					     const char *prefix,
 					     struct dentry *dentry,
 					     struct vfsmount *vfsmnt);
+extern void		    audit_log_lost(const char *message);
 				/* Private API (for audit.c only) */
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 extern int audit_filter_type(int type);
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
 			 void *data, size_t datasz, uid_t loginuid, u32 sid);
+extern int audit_enabled;
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
 #define audit_log_start(c,g,t) ({ NULL; })
@@ -530,6 +540,7 @@ extern int  audit_receive_filter(int type, int pid, int uid, int seq,
 #define audit_log_untrustedstring(a,s) do { ; } while (0)
 #define audit_log_n_untrustedstring(a,n,s) do { ; } while (0)
 #define audit_log_d_path(b,p,d,v) do { ; } while (0)
+#define audit_enabled 0
 #endif
 #endif
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3cffc1204663..b579624477f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -529,6 +529,10 @@ struct signal_struct {
 #ifdef CONFIG_TASKSTATS
 	struct taskstats *stats;
 #endif
+#ifdef CONFIG_AUDIT
+	unsigned audit_tty;
+	struct tty_audit_buf *tty_audit_buf;
+#endif
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/include/linux/tty.h b/include/linux/tty.h
index deaba9ec5004..691a1748d9d2 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -178,6 +178,7 @@ struct tty_bufhead {
 #define L_IEXTEN(tty)	_L_FLAG((tty),IEXTEN)
 
 struct device;
+struct signal_struct;
 /*
  * Where all of the state associated with a tty is kept while the tty
  * is open.  Since the termios state should be kept even if the tty
@@ -310,6 +311,7 @@ extern void tty_hangup(struct tty_struct * tty);
 extern void tty_vhangup(struct tty_struct * tty);
 extern void tty_unhangup(struct file *filp);
 extern int tty_hung_up_p(struct file * filp);
+extern int is_tty(struct file *filp);
 extern void do_SAK(struct tty_struct *tty);
 extern void __do_SAK(struct tty_struct *tty);
 extern void disassociate_ctty(int priv);
@@ -347,6 +349,37 @@ extern int tty_write_lock(struct tty_struct *tty, int ndelay);
 /* n_tty.c */
 extern struct tty_ldisc tty_ldisc_N_TTY;
 
+/* tty_audit.c */
+#ifdef CONFIG_AUDIT
+extern void tty_audit_add_data(struct tty_struct *tty, unsigned char *data,
+			       size_t size);
+extern void tty_audit_exit(void);
+extern void tty_audit_fork(struct signal_struct *sig);
+extern void tty_audit_push(struct tty_struct *tty);
+extern void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid);
+extern void tty_audit_opening(void);
+#else
+static inline void tty_audit_add_data(struct tty_struct *tty,
+				      unsigned char *data, size_t size)
+{
+}
+static inline void tty_audit_exit(void)
+{
+}
+static inline void tty_audit_fork(struct signal_struct *sig)
+{
+}
+static inline void tty_audit_push(struct tty_struct *tty)
+{
+}
+static inline void tty_audit_push_task(struct task_struct *tsk, uid_t loginuid)
+{
+}
+static inline void tty_audit_opening(void)
+{
+}
+#endif
+
 /* tty_ioctl.c */
 extern int n_tty_ioctl(struct tty_struct * tty, struct file * file,
 		       unsigned int cmd, unsigned long arg);
diff --git a/kernel/audit.c b/kernel/audit.c
index d13276d41410..5ce8851facf7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,7 @@
 #include <linux/selinux.h>
 #include <linux/inotify.h>
 #include <linux/freezer.h>
+#include <linux/tty.h>
 
 #include "audit.h"
 
@@ -423,6 +424,31 @@ static int kauditd_thread(void *dummy)
 	return 0;
 }
 
+static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
+{
+	struct task_struct *tsk;
+	int err;
+
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	err = -ESRCH;
+	if (!tsk)
+		goto out;
+	err = 0;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!tsk->signal->audit_tty)
+		err = -EPERM;
+	spin_unlock_irq(&tsk->sighand->siglock);
+	if (err)
+		goto out;
+
+	tty_audit_push_task(tsk, loginuid);
+out:
+	read_unlock(&tasklist_lock);
+	return err;
+}
+
 int audit_send_list(void *_dest)
 {
 	struct audit_netlink_list *dest = _dest;
@@ -511,6 +537,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	case AUDIT_DEL:
 	case AUDIT_DEL_RULE:
 	case AUDIT_SIGNAL_INFO:
+	case AUDIT_TTY_GET:
+	case AUDIT_TTY_SET:
 		if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
@@ -622,6 +650,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		err = audit_filter_user(&NETLINK_CB(skb), msg_type);
 		if (err == 1) {
 			err = 0;
+			if (msg_type == AUDIT_USER_TTY) {
+				err = audit_prepare_user_tty(pid, loginuid);
+				if (err)
+					break;
+			}
 			ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
 			if (ab) {
 				audit_log_format(ab,
@@ -638,8 +671,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 							" subj=%s", ctx);
 					kfree(ctx);
 				}
-				audit_log_format(ab, " msg='%.1024s'",
-					 (char *)data);
+				if (msg_type != AUDIT_USER_TTY)
+					audit_log_format(ab, " msg='%.1024s'",
+							 (char *)data);
+				else {
+					int size;
+
+					audit_log_format(ab, " msg=");
+					size = nlmsg_len(nlh);
+					audit_log_n_untrustedstring(ab, size,
+								    data);
+				}
 				audit_set_pid(ab, pid);
 				audit_log_end(ab);
 			}
@@ -730,6 +772,45 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				0, 0, sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
 		break;
+	case AUDIT_TTY_GET: {
+		struct audit_tty_status s;
+		struct task_struct *tsk;
+
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(pid);
+		if (!tsk)
+			err = -ESRCH;
+		else {
+			spin_lock_irq(&tsk->sighand->siglock);
+			s.enabled = tsk->signal->audit_tty != 0;
+			spin_unlock_irq(&tsk->sighand->siglock);
+		}
+		read_unlock(&tasklist_lock);
+		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
+				 &s, sizeof(s));
+		break;
+	}
+	case AUDIT_TTY_SET: {
+		struct audit_tty_status *s;
+		struct task_struct *tsk;
+
+		if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
+			return -EINVAL;
+		s = data;
+		if (s->enabled != 0 && s->enabled != 1)
+			return -EINVAL;
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(pid);
+		if (!tsk)
+			err = -ESRCH;
+		else {
+			spin_lock_irq(&tsk->sighand->siglock);
+			tsk->signal->audit_tty = s->enabled != 0;
+			spin_unlock_irq(&tsk->sighand->siglock);
+		}
+		read_unlock(&tasklist_lock);
+		break;
+	}
 	default:
 		err = -EINVAL;
 		break;
@@ -1185,7 +1266,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
 }
 
 /**
- * audit_log_n_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_untrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
  * @len: lenth of string (not including trailing null)
  * @string: string to be logged
@@ -1201,25 +1282,24 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
 const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
 					const char *string)
 {
-	const unsigned char *p = string;
+	const unsigned char *p;
 
-	while (*p) {
+	for (p = string; p < (const unsigned char *)string + len && *p; p++) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
 			audit_log_hex(ab, string, len);
 			return string + len + 1;
 		}
-		p++;
 	}
 	audit_log_n_string(ab, len, string);
 	return p + 1;
 }
 
 /**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_untrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
  * @string: string to be logged
  *
- * Same as audit_log_n_unstrustedstring(), except that strlen is used to
+ * Same as audit_log_n_untrustedstring(), except that strlen is used to
  * determine string length.
  */
 const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
diff --git a/kernel/audit.h b/kernel/audit.h
index 815d6f5c04ee..95877435c347 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -115,7 +115,6 @@ extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
 extern void		    audit_send_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
-extern void		    audit_log_lost(const char *message);
 extern void		    audit_panic(const char *message);
 
 struct audit_netlink_list {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e36481ed61b4..7ccc3da30a91 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -71,9 +71,6 @@
 
 extern struct list_head audit_filter_list[];
 
-/* No syscall auditing will take place unless audit_enabled != 0. */
-extern int audit_enabled;
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname(). */
 #define AUDIT_NAMES    20
diff --git a/kernel/exit.c b/kernel/exit.c
index 64a5263c8c7b..57626692cd90 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -965,6 +965,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->compat_robust_list))
 		compat_exit_robust_list(tsk);
 #endif
+	if (group_dead)
+		tty_audit_exit();
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 344d693fdc78..4015912aaac2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
+#include <linux/tty.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -897,6 +898,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	}
 	acct_init_pacct(&sig->pacct);
 
+	tty_audit_fork(sig);
+
 	return 0;
 }
 
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 42f12bd65964..89dcc485653b 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -46,10 +46,6 @@
 #include "netlabel_cipso_v4.h"
 #include "netlabel_user.h"
 
-/* do not do any auditing if audit_enabled == 0, see kernel/audit.c for
- * details */
-extern int audit_enabled;
-
 /*
  * NetLabel NETLINK Setup Functions
  */
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index ccfe8755735e..eddc7b420109 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -110,6 +110,8 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
 	{ AUDIT_DEL_RULE,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_USER,		NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 	{ AUDIT_SIGNAL_INFO,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
+	{ AUDIT_TTY_GET,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
+	{ AUDIT_TTY_SET,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 };
 
 
-- 
cgit v1.3-8-gc7d7


From 7d69a1f4a72b18876c99c697692b78339d491568 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Sun, 15 Jul 2007 23:40:58 -0700
Subject: remove CONFIG_UTS_NS and CONFIG_IPC_NS

CONFIG_UTS_NS and CONFIG_IPC_NS have very little value as they only
deactivate the unshare of the uts and ipc namespaces and do not improve
performance.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Acked-by: "Serge E. Hallyn" <serue@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ipc.h     | 11 +++--------
 include/linux/utsname.h | 13 -------------
 init/Kconfig            | 17 -----------------
 ipc/msg.c               |  4 +---
 ipc/sem.c               |  4 +---
 ipc/shm.c               |  4 +---
 ipc/util.c              | 11 +----------
 ipc/util.h              |  8 ++------
 kernel/Makefile         |  4 ++--
 kernel/nsproxy.c        | 10 ----------
 kernel/utsname_sysctl.c |  5 +----
 11 files changed, 12 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 7c8c6d8d090c..3fd3ddd5f0d9 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -93,6 +93,7 @@ extern struct ipc_namespace init_ipc_ns;
 
 #ifdef CONFIG_SYSVIPC
 #define INIT_IPC_NS(ns)		.ns		= &init_ipc_ns,
+extern void free_ipc_ns(struct kref *kref);
 extern struct ipc_namespace *copy_ipcs(unsigned long flags,
 						struct ipc_namespace *ns);
 #else
@@ -104,13 +105,9 @@ static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
 }
 #endif
 
-#ifdef CONFIG_IPC_NS
-extern void free_ipc_ns(struct kref *kref);
-#endif
-
 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 {
-#ifdef CONFIG_IPC_NS
+#ifdef CONFIG_SYSVIPC
 	if (ns)
 		kref_get(&ns->kref);
 #endif
@@ -119,7 +116,7 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 
 static inline void put_ipc_ns(struct ipc_namespace *ns)
 {
-#ifdef CONFIG_IPC_NS
+#ifdef CONFIG_SYSVIPC
 	kref_put(&ns->kref, free_ipc_ns);
 #endif
 }
@@ -127,5 +124,3 @@ static inline void put_ipc_ns(struct ipc_namespace *ns)
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_IPC_H */
-
-
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index f8d3b326e93a..51ad167611e4 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -48,7 +48,6 @@ static inline void get_uts_ns(struct uts_namespace *ns)
 	kref_get(&ns->kref);
 }
 
-#ifdef CONFIG_UTS_NS
 extern struct uts_namespace *copy_utsname(int flags, struct uts_namespace *ns);
 extern void free_uts_ns(struct kref *kref);
 
@@ -56,18 +55,6 @@ static inline void put_uts_ns(struct uts_namespace *ns)
 {
 	kref_put(&ns->kref, free_uts_ns);
 }
-#else
-static inline struct uts_namespace *copy_utsname(int flags,
-						struct uts_namespace *ns)
-{
-	return ns;
-}
-
-static inline void put_uts_ns(struct uts_namespace *ns)
-{
-}
-#endif
-
 static inline struct new_utsname *utsname(void)
 {
 	return &current->nsproxy->uts_ns->name;
diff --git a/init/Kconfig b/init/Kconfig
index 73e91983760f..1e198b8c6936 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -120,15 +120,6 @@ config SYSVIPC
 	  section 6.4 of the Linux Programmer's Guide, available from
 	  <http://www.tldp.org/guides.html>.
 
-config IPC_NS
-	bool "IPC Namespaces"
-	depends on SYSVIPC
-	default n
-	help
-	  Support ipc namespaces.  This allows containers, i.e. virtual
-	  environments, to use ipc namespaces to provide different ipc
-	  objects for different servers.  If unsure, say N.
-
 config SYSVIPC_SYSCTL
 	bool
 	depends on SYSVIPC
@@ -218,14 +209,6 @@ config TASK_IO_ACCOUNTING
 
 	  Say N if unsure.
 
-config UTS_NS
-	bool "UTS Namespaces"
-	default n
-	help
-	  Support uts namespaces.  This allows containers, i.e.
-	  vservers, to use uts namespaces to provide different
-	  uts info for different servers.  If unsure, say N.
-
 config AUDIT
 	bool "Auditing support"
 	depends on NET
diff --git a/ipc/msg.c b/ipc/msg.c
index a388824740e7..cbd27e519943 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -87,7 +87,7 @@ static int newque (struct ipc_namespace *ns, key_t key, int msgflg);
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it);
 #endif
 
-static void __ipc_init __msg_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids)
+static void __msg_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids)
 {
 	ns->ids[IPC_MSG_IDS] = ids;
 	ns->msg_ctlmax = MSGMAX;
@@ -96,7 +96,6 @@ static void __ipc_init __msg_init_ns(struct ipc_namespace *ns, struct ipc_ids *i
 	ipc_init_ids(ids, ns->msg_ctlmni);
 }
 
-#ifdef CONFIG_IPC_NS
 int msg_init_ns(struct ipc_namespace *ns)
 {
 	struct ipc_ids *ids;
@@ -128,7 +127,6 @@ void msg_exit_ns(struct ipc_namespace *ns)
 	kfree(ns->ids[IPC_MSG_IDS]);
 	ns->ids[IPC_MSG_IDS] = NULL;
 }
-#endif
 
 void __init msg_init(void)
 {
diff --git a/ipc/sem.c b/ipc/sem.c
index 9964b2224c70..89bfdffb38d8 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -121,7 +121,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #define sc_semopm	sem_ctls[2]
 #define sc_semmni	sem_ctls[3]
 
-static void __ipc_init __sem_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids)
+static void __sem_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids)
 {
 	ns->ids[IPC_SEM_IDS] = ids;
 	ns->sc_semmsl = SEMMSL;
@@ -132,7 +132,6 @@ static void __ipc_init __sem_init_ns(struct ipc_namespace *ns, struct ipc_ids *i
 	ipc_init_ids(ids, ns->sc_semmni);
 }
 
-#ifdef CONFIG_IPC_NS
 int sem_init_ns(struct ipc_namespace *ns)
 {
 	struct ipc_ids *ids;
@@ -164,7 +163,6 @@ void sem_exit_ns(struct ipc_namespace *ns)
 	kfree(ns->ids[IPC_SEM_IDS]);
 	ns->ids[IPC_SEM_IDS] = NULL;
 }
-#endif
 
 void __init sem_init (void)
 {
diff --git a/ipc/shm.c b/ipc/shm.c
index 0852f206d895..242c3f66493a 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -77,7 +77,7 @@ static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp);
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
 #endif
 
-static void __ipc_init __shm_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids)
+static void __shm_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids)
 {
 	ns->ids[IPC_SHM_IDS] = ids;
 	ns->shm_ctlmax = SHMMAX;
@@ -98,7 +98,6 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *shp)
 		shm_destroy(ns, shp);
 }
 
-#ifdef CONFIG_IPC_NS
 int shm_init_ns(struct ipc_namespace *ns)
 {
 	struct ipc_ids *ids;
@@ -130,7 +129,6 @@ void shm_exit_ns(struct ipc_namespace *ns)
 	kfree(ns->ids[IPC_SHM_IDS]);
 	ns->ids[IPC_SHM_IDS] = NULL;
 }
-#endif
 
 void __init shm_init (void)
 {
diff --git a/ipc/util.c b/ipc/util.c
index 7536a7292d48..44e5135aee47 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -50,7 +50,6 @@ struct ipc_namespace init_ipc_ns = {
 	},
 };
 
-#ifdef CONFIG_IPC_NS
 static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
 {
 	int err;
@@ -110,14 +109,6 @@ void free_ipc_ns(struct kref *kref)
 	shm_exit_ns(ns);
 	kfree(ns);
 }
-#else
-struct ipc_namespace *copy_ipcs(unsigned long flags, struct ipc_namespace *ns)
-{
-	if (flags & CLONE_NEWIPC)
-		return ERR_PTR(-EINVAL);
-	return ns;
-}
-#endif
 
 /**
  *	ipc_init	-	initialise IPC subsystem
@@ -145,7 +136,7 @@ __initcall(ipc_init);
  *	array itself. 
  */
  
-void __ipc_init ipc_init_ids(struct ipc_ids* ids, int size)
+void ipc_init_ids(struct ipc_ids* ids, int size)
 {
 	int i;
 
diff --git a/ipc/util.h b/ipc/util.h
index e3aa2c5c97dc..333e891bcaca 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -41,12 +41,8 @@ struct ipc_ids {
 };
 
 struct seq_file;
-#ifdef CONFIG_IPC_NS
-#define __ipc_init
-#else
-#define __ipc_init	__init
-#endif
-void __ipc_init ipc_init_ids(struct ipc_ids *ids, int size);
+
+void ipc_init_ids(struct ipc_ids *ids, int size);
 #ifdef CONFIG_PROC_FS
 void __init ipc_init_proc_interface(const char *path, const char *header,
 		int ids, int (*show)(struct seq_file *, void *));
diff --git a/kernel/Makefile b/kernel/Makefile
index 642d4277c2ea..fa8efd437afb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,8 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
+	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
+	    utsname.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -48,7 +49,6 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
-obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9e83b589f754..e38bed75367d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -155,16 +155,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
 		return 0;
 
-#ifndef CONFIG_IPC_NS
-	if (unshare_flags & CLONE_NEWIPC)
-		return -EINVAL;
-#endif
-
-#ifndef CONFIG_UTS_NS
-	if (unshare_flags & CLONE_NEWUTS)
-		return -EINVAL;
-#endif
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index f22b9dbd2a9c..c76c06466bfd 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -18,10 +18,7 @@
 static void *get_uts(ctl_table *table, int write)
 {
 	char *which = table->data;
-#ifdef CONFIG_UTS_NS
-	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
-	which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
-#endif
+
 	if (!write)
 		down_read(&uts_sem);
 	else
-- 
cgit v1.3-8-gc7d7


From acce292c82d4d82d35553b928df2b0597c3a9c78 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Sun, 15 Jul 2007 23:40:59 -0700
Subject: user namespace: add the framework

Basically, it will allow a process to unshare its user_struct table,
resetting at the same time its own user_struct and all the associated
accounting.

A new root user (uid == 0) is added to the user namespace upon creation.
Such root users have full privileges and it seems that theses privileges
should be controlled through some means (process capabilities ?)

The unshare is not included in this patch.

Changes since [try #4]:
	- Updated get_user_ns and put_user_ns to accept NULL, and
	  get_user_ns to return the namespace.

Changes since [try #3]:
	- moved struct user_namespace to files user_namespace.{c,h}

Changes since [try #2]:
	- removed struct user_namespace* argument from find_user()

Changes since [try #1]:
	- removed struct user_namespace* argument from find_user()
	- added a root_user per user namespace

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Pavel Emelianov <xemul@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Andrew Morgan <agm@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/init_task.h      |  2 ++
 include/linux/nsproxy.h        |  1 +
 include/linux/sched.h          |  3 ++-
 include/linux/user_namespace.h | 57 ++++++++++++++++++++++++++++++++++++++++++
 init/Kconfig                   |  9 +++++++
 kernel/Makefile                |  2 +-
 kernel/fork.c                  |  2 +-
 kernel/nsproxy.c               |  9 +++++++
 kernel/sys.c                   |  5 ++--
 kernel/user.c                  | 18 ++++++-------
 kernel/user_namespace.c        | 43 +++++++++++++++++++++++++++++++
 11 files changed, 137 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/user_namespace.h
 create mode 100644 kernel/user_namespace.c

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 276ccaa2670c..cab741c2d603 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -8,6 +8,7 @@
 #include <linux/lockdep.h>
 #include <linux/ipc.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 
 #define INIT_FDTABLE \
 {							\
@@ -78,6 +79,7 @@ extern struct nsproxy init_nsproxy;
 	.uts_ns		= &init_uts_ns,					\
 	.mnt_ns		= NULL,						\
 	INIT_IPC_NS(ipc_ns)						\
+	.user_ns	= &init_user_ns,				\
 }
 
 #define INIT_SIGHAND(sighand) {						\
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 189e0dc993ab..6d179a397bfb 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -28,6 +28,7 @@ struct nsproxy {
 	struct ipc_namespace *ipc_ns;
 	struct mnt_namespace *mnt_ns;
 	struct pid_namespace *pid_ns;
+	struct user_namespace *user_ns;
 };
 extern struct nsproxy init_nsproxy;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b579624477f4..c667255d70db 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -287,6 +287,7 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void schedule(void);
 
 struct nsproxy;
+struct user_namespace;
 
 /* Maximum number of active map areas.. This is a random (large) number */
 #define DEFAULT_MAX_MAP_COUNT	65536
@@ -1408,7 +1409,7 @@ extern struct task_struct *find_task_by_pid_type(int type, int pid);
 extern void __set_special_pids(pid_t session, pid_t pgrp);
 
 /* per-UID process charging. */
-extern struct user_struct * alloc_uid(uid_t);
+extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
 static inline struct user_struct *get_uid(struct user_struct *u)
 {
 	atomic_inc(&u->__count);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
new file mode 100644
index 000000000000..92a45867ecfb
--- /dev/null
+++ b/include/linux/user_namespace.h
@@ -0,0 +1,57 @@
+#ifndef _LINUX_USER_NAMESPACE_H
+#define _LINUX_USER_NAMESPACE_H
+
+#include <linux/kref.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
+
+#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 8)
+#define UIDHASH_SZ	(1 << UIDHASH_BITS)
+
+struct user_namespace {
+	struct kref		kref;
+	struct list_head	uidhash_table[UIDHASH_SZ];
+	struct user_struct	*root_user;
+};
+
+extern struct user_namespace init_user_ns;
+
+#ifdef CONFIG_USER_NS
+
+static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+{
+	if (ns)
+		kref_get(&ns->kref);
+	return ns;
+}
+
+extern struct user_namespace *copy_user_ns(int flags,
+					   struct user_namespace *old_ns);
+extern void free_user_ns(struct kref *kref);
+
+static inline void put_user_ns(struct user_namespace *ns)
+{
+	if (ns)
+		kref_put(&ns->kref, free_user_ns);
+}
+
+#else
+
+static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+{
+	return &init_user_ns;
+}
+
+static inline struct user_namespace *copy_user_ns(int flags,
+						  struct user_namespace *old_ns)
+{
+	return NULL;
+}
+
+static inline void put_user_ns(struct user_namespace *ns)
+{
+}
+
+#endif
+
+#endif /* _LINUX_USER_H */
diff --git a/init/Kconfig b/init/Kconfig
index 1e198b8c6936..0b0e29ed82d1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -209,6 +209,15 @@ config TASK_IO_ACCOUNTING
 
 	  Say N if unsure.
 
+config USER_NS
+	bool "User Namespaces (EXPERIMENTAL)"
+	default n
+	depends on EXPERIMENTAL
+	help
+	  Support user namespaces.  This allows containers, i.e.
+	  vservers, to use user namespaces to provide different
+	  user info for different servers.  If unsure, say N.
+
 config AUDIT
 	bool "Auditing support"
 	depends on NET
diff --git a/kernel/Makefile b/kernel/Makefile
index fa8efd437afb..2a999836ca18 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,7 +4,7 @@
 
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o capability.o ptrace.o timer.o user.o \
+	    sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
diff --git a/kernel/fork.c b/kernel/fork.c
index 4015912aaac2..13cf0978780a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1002,7 +1002,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (atomic_read(&p->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-				p->user != &root_user)
+		    p->user != current->nsproxy->user_ns->root_user)
 			goto bad_fork_free;
 	}
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index e38bed75367d..895e3a3f2044 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -79,8 +79,15 @@ static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
 	if (IS_ERR(new_nsp->pid_ns))
 		goto out_pid;
 
+	new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
+	if (IS_ERR(new_nsp->user_ns))
+		goto out_user;
+
 	return new_nsp;
 
+out_user:
+	if (new_nsp->pid_ns)
+		put_pid_ns(new_nsp->pid_ns);
 out_pid:
 	if (new_nsp->ipc_ns)
 		put_ipc_ns(new_nsp->ipc_ns);
@@ -140,6 +147,8 @@ void free_nsproxy(struct nsproxy *ns)
 		put_ipc_ns(ns->ipc_ns);
 	if (ns->pid_ns)
 		put_pid_ns(ns->pid_ns);
+	if (ns->user_ns)
+		put_user_ns(ns->user_ns);
 	kfree(ns);
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 872271ccc384..ed92e2f03342 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -35,6 +35,7 @@
 #include <linux/compat.h>
 #include <linux/syscalls.h>
 #include <linux/kprobes.h>
+#include <linux/user_namespace.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -1078,13 +1079,13 @@ static int set_user(uid_t new_ruid, int dumpclear)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(new_ruid);
+	new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
 	if (!new_user)
 		return -EAGAIN;
 
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
-			new_user != &root_user) {
+			new_user != current->nsproxy->user_ns->root_user) {
 		free_uid(new_user);
 		return -EAGAIN;
 	}
diff --git a/kernel/user.c b/kernel/user.c
index 4869563080e9..98b82507797a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,20 +14,19 @@
 #include <linux/bitops.h>
 #include <linux/key.h>
 #include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/user_namespace.h>
 
 /*
  * UID task count cache, to get fast user lookup in "alloc_uid"
  * when changing user ID's (ie setuid() and friends).
  */
 
-#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
-#define UIDHASH_SZ		(1 << UIDHASH_BITS)
 #define UIDHASH_MASK		(UIDHASH_SZ - 1)
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid)	(uidhash_table + __uidhashfn((uid)))
+#define uidhashentry(ns, uid)	((ns)->uidhash_table + __uidhashfn((uid)))
 
 static struct kmem_cache *uid_cachep;
-static struct list_head uidhash_table[UIDHASH_SZ];
 
 /*
  * The uidhash_lock is mostly taken from process context, but it is
@@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
+	struct user_namespace *ns = current->nsproxy->user_ns;
 
 	spin_lock_irqsave(&uidhash_lock, flags);
-	ret = uid_hash_find(uid, uidhashentry(uid));
+	ret = uid_hash_find(uid, uidhashentry(ns, uid));
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	return ret;
 }
@@ -120,9 +120,9 @@ void free_uid(struct user_struct *up)
 	}
 }
 
-struct user_struct * alloc_uid(uid_t uid)
+struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 {
-	struct list_head *hashent = uidhashentry(uid);
+	struct list_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up;
 
 	spin_lock_irq(&uidhash_lock);
@@ -211,11 +211,11 @@ static int __init uid_cache_init(void)
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
-		INIT_LIST_HEAD(uidhash_table + n);
+		INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
-	uid_hash_insert(&root_user, uidhashentry(0));
+	uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
 	spin_unlock_irq(&uidhash_lock);
 
 	return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
new file mode 100644
index 000000000000..3d7964209774
--- /dev/null
+++ b/kernel/user_namespace.c
@@ -0,0 +1,43 @@
+/*
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
+
+struct user_namespace init_user_ns = {
+	.kref = {
+		.refcount	= ATOMIC_INIT(2),
+	},
+	.root_user = &root_user,
+};
+
+EXPORT_SYMBOL_GPL(init_user_ns);
+
+#ifdef CONFIG_USER_NS
+
+struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
+{
+	struct user_namespace *new_ns;
+
+	BUG_ON(!old_ns);
+	get_user_ns(old_ns);
+
+	new_ns = old_ns;
+	return new_ns;
+}
+
+void free_user_ns(struct kref *kref)
+{
+	struct user_namespace *ns;
+
+	ns = container_of(kref, struct user_namespace, kref);
+	kfree(ns);
+}
+
+#endif /* CONFIG_USER_NS */
-- 
cgit v1.3-8-gc7d7


From 77ec739d8d0979477fc91f530403805afa2581a4 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Sun, 15 Jul 2007 23:41:01 -0700
Subject: user namespace: add unshare

This patch enables the unshare of user namespaces.

It adds a new clone flag CLONE_NEWUSER and implements copy_user_ns() which
resets the current user_struct and adds a new root user (uid == 0)

For now, unsharing the user namespace allows a process to reset its
user_struct accounting and uid 0 in the new user namespace should be contained
using appropriate means, for instance selinux

The plan, when the full support is complete (all uid checks covered), is to
keep the original user's rights in the original namespace, and let a process
become uid 0 in the new namespace, with full capabilities to the new
namespace.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Acked-by: Pavel Emelianov <xemul@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Andrew Morgan <agm@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h          |  1 +
 include/linux/user_namespace.h |  4 ++++
 kernel/fork.c                  |  2 +-
 kernel/nsproxy.c               |  5 +++--
 kernel/user_namespace.c        | 46 +++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c667255d70db..731edaca8ffd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -26,6 +26,7 @@
 #define CLONE_STOPPED		0x02000000	/* Start in stopped state */
 #define CLONE_NEWUTS		0x04000000	/* New utsname group? */
 #define CLONE_NEWIPC		0x08000000	/* New ipcs */
+#define CLONE_NEWUSER		0x10000000	/* New user namespace */
 
 /*
  * Scheduling policies
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 92a45867ecfb..bb320573bb9e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -4,6 +4,7 @@
 #include <linux/kref.h>
 #include <linux/nsproxy.h>
 #include <linux/sched.h>
+#include <linux/err.h>
 
 #define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 8)
 #define UIDHASH_SZ	(1 << UIDHASH_BITS)
@@ -45,6 +46,9 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 static inline struct user_namespace *copy_user_ns(int flags,
 						  struct user_namespace *old_ns)
 {
+	if (flags & CLONE_NEWUSER)
+		return ERR_PTR(-EINVAL);
+
 	return NULL;
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 13cf0978780a..7c5c5888e00a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1606,7 +1606,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	err = -EINVAL;
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC))
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
 		goto bad_unshare_out;
 
 	if ((err = unshare_thread(unshare_flags)))
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 895e3a3f2044..5aa28e219487 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -117,7 +117,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 
 	get_nsproxy(old_ns);
 
-	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN)) {
@@ -161,7 +161,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 {
 	int err = 0;
 
-	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+			       CLONE_NEWUSER)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 3d7964209774..89a27e8b17fb 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,45 @@ EXPORT_SYMBOL_GPL(init_user_ns);
 
 #ifdef CONFIG_USER_NS
 
+/*
+ * Clone a new ns copying an original user ns, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+{
+	struct user_namespace *ns;
+	struct user_struct *new_user;
+	int n;
+
+	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+	if (!ns)
+		return NULL;
+
+	kref_init(&ns->kref);
+
+	for (n = 0; n < UIDHASH_SZ; ++n)
+		INIT_LIST_HEAD(ns->uidhash_table + n);
+
+	/* Insert new root user.  */
+	ns->root_user = alloc_uid(ns, 0);
+	if (!ns->root_user) {
+		kfree(ns);
+		return NULL;
+	}
+
+	/* Reset current->user with a new one */
+	new_user = alloc_uid(ns, current->uid);
+	if (!new_user) {
+		free_uid(ns->root_user);
+		kfree(ns);
+		return NULL;
+	}
+
+	switch_uid(new_user);
+	return ns;
+}
+
 struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
 {
 	struct user_namespace *new_ns;
@@ -28,7 +67,12 @@ struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
 	BUG_ON(!old_ns);
 	get_user_ns(old_ns);
 
-	new_ns = old_ns;
+	if (!(flags & CLONE_NEWUSER))
+		return old_ns;
+
+	new_ns = clone_user_ns(old_ns);
+
+	put_user_ns(old_ns);
 	return new_ns;
 }
 
-- 
cgit v1.3-8-gc7d7


From ea5a3dcfda1c9140228f2842ea9b01e1713c559a Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Sun, 15 Jul 2007 23:41:04 -0700
Subject: COBALT: remove all references to Cobalt NVRAM

Remove not only the references to Cobalt NVRAM, but the header file as
well.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Acked-by: Tim Hockin <thockin@hockin.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/nvram.c         | 192 +------------------------------------------
 include/linux/cobalt-nvram.h | 109 ------------------------
 2 files changed, 1 insertion(+), 300 deletions(-)
 delete mode 100644 include/linux/cobalt-nvram.h

(limited to 'include/linux')

diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c
index 204deaa0de80..98dec380af49 100644
--- a/drivers/char/nvram.c
+++ b/drivers/char/nvram.c
@@ -42,19 +42,12 @@
 
 #define PC		1
 #define ATARI		2
-#define COBALT		3
 
 /* select machine configuration */
 #if defined(CONFIG_ATARI)
 #  define MACH ATARI
 #elif defined(__i386__) || defined(__x86_64__) || defined(__arm__)  /* and others?? */
-#define MACH PC
-#  if defined(CONFIG_COBALT)
-#    include <linux/cobalt-nvram.h>
-#    define MACH COBALT
-#  else
-#    define MACH PC
-#  endif
+#  define MACH PC
 #else
 #  error Cannot build nvram driver for this machine configuration.
 #endif
@@ -76,18 +69,6 @@
 
 #endif
 
-#if MACH == COBALT
-
-#define CHECK_DRIVER_INIT()     1
-
-#define NVRAM_BYTES		(128-NVRAM_FIRST_BYTE)
-
-#define mach_check_checksum	cobalt_check_checksum
-#define mach_set_checksum	cobalt_set_checksum
-#define mach_proc_infos		cobalt_proc_infos
-
-#endif
-
 #if MACH == ATARI
 
 /* Special parameters for RTC in Atari machines */
@@ -604,177 +585,6 @@ pc_proc_infos(unsigned char *nvram, char *buffer, int *len,
 
 #endif /* MACH == PC */
 
-#if MACH == COBALT
-
-/* the cobalt CMOS has a wider range of its checksum */
-static int cobalt_check_checksum(void)
-{
-	int i;
-	unsigned short sum = 0;
-	unsigned short expect;
-
-	for (i = COBT_CMOS_CKS_START; i <= COBT_CMOS_CKS_END; ++i) {
-		if ((i == COBT_CMOS_CHECKSUM) || (i == (COBT_CMOS_CHECKSUM+1)))
-			continue;
-
-		sum += __nvram_read_byte(i);
-	}
-	expect = __nvram_read_byte(COBT_CMOS_CHECKSUM) << 8 |
-	    __nvram_read_byte(COBT_CMOS_CHECKSUM+1);
-	return ((sum & 0xffff) == expect);
-}
-
-static void cobalt_set_checksum(void)
-{
-	int i;
-	unsigned short sum = 0;
-
-	for (i = COBT_CMOS_CKS_START; i <= COBT_CMOS_CKS_END; ++i) {
-		if ((i == COBT_CMOS_CHECKSUM) || (i == (COBT_CMOS_CHECKSUM+1)))
-			continue;
-
-		sum += __nvram_read_byte(i);
-	}
-
-	__nvram_write_byte(sum >> 8, COBT_CMOS_CHECKSUM);
-	__nvram_write_byte(sum & 0xff, COBT_CMOS_CHECKSUM+1);
-}
-
-#ifdef CONFIG_PROC_FS
-
-static int cobalt_proc_infos(unsigned char *nvram, char *buffer, int *len,
-	off_t *begin, off_t offset, int size)
-{
-	int i;
-	unsigned int checksum;
-	unsigned int flags;
-	char sernum[14];
-	char *key = "cNoEbTaWlOtR!";
-	unsigned char bto_csum;
-
-	spin_lock_irq(&rtc_lock);
-	checksum = __nvram_check_checksum();
-	spin_unlock_irq(&rtc_lock);
-
-	PRINT_PROC("Checksum status: %svalid\n", checksum ? "" : "not ");
-
-	flags = nvram[COBT_CMOS_FLAG_BYTE_0] << 8 
-	    | nvram[COBT_CMOS_FLAG_BYTE_1];
-
-	PRINT_PROC("Console: %s\n",
-		flags & COBT_CMOS_CONSOLE_FLAG ?  "on": "off");
-
-	PRINT_PROC("Firmware Debug Messages: %s\n",
-		flags & COBT_CMOS_DEBUG_FLAG ? "on": "off");
-
-	PRINT_PROC("Auto Prompt: %s\n",
-		flags & COBT_CMOS_AUTO_PROMPT_FLAG ? "on": "off");
-
-	PRINT_PROC("Shutdown Status: %s\n",
-		flags & COBT_CMOS_CLEAN_BOOT_FLAG ? "clean": "dirty");
-
-	PRINT_PROC("Hardware Probe: %s\n",
-		flags & COBT_CMOS_HW_NOPROBE_FLAG ? "partial": "full");
-
-	PRINT_PROC("System Fault: %sdetected\n",
-		flags & COBT_CMOS_SYSFAULT_FLAG ? "": "not ");
-
-	PRINT_PROC("Panic on OOPS: %s\n",
-		flags & COBT_CMOS_OOPSPANIC_FLAG ? "yes": "no");
-
-	PRINT_PROC("Delayed Cache Initialization: %s\n",
-		flags & COBT_CMOS_DELAY_CACHE_FLAG ? "yes": "no");
-
-	PRINT_PROC("Show Logo at Boot: %s\n",
-		flags & COBT_CMOS_NOLOGO_FLAG ? "no": "yes");
-
-	PRINT_PROC("Boot Method: ");
-	switch (nvram[COBT_CMOS_BOOT_METHOD]) {
-	case COBT_CMOS_BOOT_METHOD_DISK:
-		PRINT_PROC("disk\n");
-		break;
-
-	case COBT_CMOS_BOOT_METHOD_ROM:
-		PRINT_PROC("rom\n");
-		break;
-
-	case COBT_CMOS_BOOT_METHOD_NET:
-		PRINT_PROC("net\n");
-		break;
-
-	default:
-		PRINT_PROC("unknown\n");
-		break;
-	}
-
-	PRINT_PROC("Primary Boot Device: %d:%d\n",
-		nvram[COBT_CMOS_BOOT_DEV0_MAJ],
-		nvram[COBT_CMOS_BOOT_DEV0_MIN] );
-	PRINT_PROC("Secondary Boot Device: %d:%d\n",
-		nvram[COBT_CMOS_BOOT_DEV1_MAJ],
-		nvram[COBT_CMOS_BOOT_DEV1_MIN] );
-	PRINT_PROC("Tertiary Boot Device: %d:%d\n",
-		nvram[COBT_CMOS_BOOT_DEV2_MAJ],
-		nvram[COBT_CMOS_BOOT_DEV2_MIN] );
-
-	PRINT_PROC("Uptime: %d\n",
-		nvram[COBT_CMOS_UPTIME_0] << 24 |
-		nvram[COBT_CMOS_UPTIME_1] << 16 |
-		nvram[COBT_CMOS_UPTIME_2] << 8  |
-		nvram[COBT_CMOS_UPTIME_3]);
-
-	PRINT_PROC("Boot Count: %d\n",
-		nvram[COBT_CMOS_BOOTCOUNT_0] << 24 |
-		nvram[COBT_CMOS_BOOTCOUNT_1] << 16 |
-		nvram[COBT_CMOS_BOOTCOUNT_2] << 8  |
-		nvram[COBT_CMOS_BOOTCOUNT_3]);
-
-	/* 13 bytes of serial num */
-	for (i=0 ; i<13 ; i++) {
-		sernum[i] = nvram[COBT_CMOS_SYS_SERNUM_0 + i];
-	}
-	sernum[13] = '\0';
-
-	checksum = 0;
-	for (i=0 ; i<13 ; i++) {
-		checksum += sernum[i] ^ key[i];
-	}
-	checksum = ((checksum & 0x7f) ^ (0xd6)) & 0xff;
-
-	PRINT_PROC("Serial Number: %s", sernum);
-	if (checksum != nvram[COBT_CMOS_SYS_SERNUM_CSUM]) {
-		PRINT_PROC(" (invalid checksum)");
-	}
-	PRINT_PROC("\n");
-
-	PRINT_PROC("Rom Revison: %d.%d.%d\n", nvram[COBT_CMOS_ROM_REV_MAJ],
-		nvram[COBT_CMOS_ROM_REV_MIN], nvram[COBT_CMOS_ROM_REV_REV]);
-
-	PRINT_PROC("BTO Server: %d.%d.%d.%d", nvram[COBT_CMOS_BTO_IP_0],
-		nvram[COBT_CMOS_BTO_IP_1], nvram[COBT_CMOS_BTO_IP_2],
-		nvram[COBT_CMOS_BTO_IP_3]);
-	bto_csum = nvram[COBT_CMOS_BTO_IP_0] + nvram[COBT_CMOS_BTO_IP_1]
-		+ nvram[COBT_CMOS_BTO_IP_2] + nvram[COBT_CMOS_BTO_IP_3];
-	if (bto_csum != nvram[COBT_CMOS_BTO_IP_CSUM]) {
-		PRINT_PROC(" (invalid checksum)");
-	}
-	PRINT_PROC("\n");
-
-	if (flags & COBT_CMOS_VERSION_FLAG
-	 && nvram[COBT_CMOS_VERSION] >= COBT_CMOS_VER_BTOCODE) {
-		PRINT_PROC("BTO Code: 0x%x\n",
-			nvram[COBT_CMOS_BTO_CODE_0] << 24 |
-			nvram[COBT_CMOS_BTO_CODE_1] << 16 |
-			nvram[COBT_CMOS_BTO_CODE_2] << 8 |
-			nvram[COBT_CMOS_BTO_CODE_3]);
-	}
-
-	return 1;
-}
-#endif /* CONFIG_PROC_FS */
-
-#endif /* MACH == COBALT */
-
 #if MACH == ATARI
 
 static int
diff --git a/include/linux/cobalt-nvram.h b/include/linux/cobalt-nvram.h
deleted file mode 100644
index ea429562ff36..000000000000
--- a/include/linux/cobalt-nvram.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * $Id: cobalt-nvram.h,v 1.20 2001/10/17 23:16:55 thockin Exp $
- * cobalt-nvram.h : defines for the various fields in the cobalt NVRAM
- *
- * Copyright 2001,2002 Sun Microsystems, Inc.
- */
-
-#ifndef COBALT_NVRAM_H
-#define COBALT_NVRAM_H
-
-#include <linux/nvram.h>
-
-#define COBT_CMOS_INFO_MAX		0x7f	/* top address allowed */
-#define COBT_CMOS_BIOS_DRIVE_INFO	0x12	/* drive info would go here */
-
-#define COBT_CMOS_CKS_START		NVRAM_OFFSET(0x0e)
-#define COBT_CMOS_CKS_END		NVRAM_OFFSET(0x7f)
-
-/* flag bytes - 16 flags for now, leave room for more */
-#define COBT_CMOS_FLAG_BYTE_0		NVRAM_OFFSET(0x10)
-#define COBT_CMOS_FLAG_BYTE_1		NVRAM_OFFSET(0x11)
-
-/* flags in flag bytes - up to 16 */
-#define COBT_CMOS_FLAG_MIN		0x0001
-#define COBT_CMOS_CONSOLE_FLAG		0x0001 /* console on/off */
-#define COBT_CMOS_DEBUG_FLAG		0x0002 /* ROM debug messages */
-#define COBT_CMOS_AUTO_PROMPT_FLAG	0x0004 /* boot to ROM prompt? */
-#define COBT_CMOS_CLEAN_BOOT_FLAG	0x0008 /* set by a clean shutdown */
-#define COBT_CMOS_HW_NOPROBE_FLAG	0x0010 /* go easy on the probing */
-#define COBT_CMOS_SYSFAULT_FLAG		0x0020 /* system fault detected */
-#define COBT_CMOS_OOPSPANIC_FLAG	0x0040 /* panic on oops */
-#define COBT_CMOS_DELAY_CACHE_FLAG	0x0080 /* delay cache initialization */
-#define COBT_CMOS_NOLOGO_FLAG		0x0100 /* hide "C" logo @ boot */
-#define COBT_CMOS_VERSION_FLAG		0x0200 /* the version field is valid */
-#define COBT_CMOS_FLAG_MAX		0x0200
-
-/* leave byte 0x12 blank - Linux looks for drive info here */
-
-/* CMOS structure version, valid if COBT_CMOS_VERSION_FLAG is true */
-#define COBT_CMOS_VERSION		NVRAM_OFFSET(0x13)
-#define COBT_CMOS_VER_BTOCODE		1 /* min. version needed for btocode */
-
-/* index of default boot method */
-#define COBT_CMOS_BOOT_METHOD		NVRAM_OFFSET(0x20)
-#define COBT_CMOS_BOOT_METHOD_DISK	0
-#define COBT_CMOS_BOOT_METHOD_ROM	1
-#define COBT_CMOS_BOOT_METHOD_NET	2
-
-#define COBT_CMOS_BOOT_DEV_MIN		NVRAM_OFFSET(0x21)
-/* major #, minor # of first through fourth boot device */
-#define COBT_CMOS_BOOT_DEV0_MAJ		NVRAM_OFFSET(0x21)
-#define COBT_CMOS_BOOT_DEV0_MIN		NVRAM_OFFSET(0x22)
-#define COBT_CMOS_BOOT_DEV1_MAJ		NVRAM_OFFSET(0x23)
-#define COBT_CMOS_BOOT_DEV1_MIN		NVRAM_OFFSET(0x24)
-#define COBT_CMOS_BOOT_DEV2_MAJ		NVRAM_OFFSET(0x25)
-#define COBT_CMOS_BOOT_DEV2_MIN		NVRAM_OFFSET(0x26)
-#define COBT_CMOS_BOOT_DEV3_MAJ		NVRAM_OFFSET(0x27)
-#define COBT_CMOS_BOOT_DEV3_MIN		NVRAM_OFFSET(0x28)
-#define COBT_CMOS_BOOT_DEV_MAX		NVRAM_OFFSET(0x28)
-
-/* checksum of bytes 0xe-0x7f */
-#define COBT_CMOS_CHECKSUM		NVRAM_OFFSET(0x2e)
-
-/* running uptime counter, units of 5 minutes (32 bits =~ 41000 years) */
-#define COBT_CMOS_UPTIME_0		NVRAM_OFFSET(0x30)
-#define COBT_CMOS_UPTIME_1		NVRAM_OFFSET(0x31)
-#define COBT_CMOS_UPTIME_2		NVRAM_OFFSET(0x32)
-#define COBT_CMOS_UPTIME_3		NVRAM_OFFSET(0x33)
-
-/* count of successful boots (32 bits) */
-#define COBT_CMOS_BOOTCOUNT_0		NVRAM_OFFSET(0x38)
-#define COBT_CMOS_BOOTCOUNT_1		NVRAM_OFFSET(0x39)
-#define COBT_CMOS_BOOTCOUNT_2		NVRAM_OFFSET(0x3a)
-#define COBT_CMOS_BOOTCOUNT_3		NVRAM_OFFSET(0x3b)
-
-/* 13 bytes: system serial number, same as on the back of the system */
-#define COBT_CMOS_SYS_SERNUM_LEN	13
-#define COBT_CMOS_SYS_SERNUM_0		NVRAM_OFFSET(0x40)
-#define COBT_CMOS_SYS_SERNUM_1		NVRAM_OFFSET(0x41)
-#define COBT_CMOS_SYS_SERNUM_2		NVRAM_OFFSET(0x42)
-#define COBT_CMOS_SYS_SERNUM_3		NVRAM_OFFSET(0x43)
-#define COBT_CMOS_SYS_SERNUM_4		NVRAM_OFFSET(0x44)
-#define COBT_CMOS_SYS_SERNUM_5		NVRAM_OFFSET(0x45)
-#define COBT_CMOS_SYS_SERNUM_6		NVRAM_OFFSET(0x46)
-#define COBT_CMOS_SYS_SERNUM_7		NVRAM_OFFSET(0x47)
-#define COBT_CMOS_SYS_SERNUM_8		NVRAM_OFFSET(0x48)
-#define COBT_CMOS_SYS_SERNUM_9		NVRAM_OFFSET(0x49)
-#define COBT_CMOS_SYS_SERNUM_10		NVRAM_OFFSET(0x4a)
-#define COBT_CMOS_SYS_SERNUM_11		NVRAM_OFFSET(0x4b)
-#define COBT_CMOS_SYS_SERNUM_12		NVRAM_OFFSET(0x4c)
-/* checksum for serial num - 1 byte */
-#define COBT_CMOS_SYS_SERNUM_CSUM	NVRAM_OFFSET(0x4f)
-
-#define COBT_CMOS_ROM_REV_MAJ		NVRAM_OFFSET(0x50)
-#define COBT_CMOS_ROM_REV_MIN		NVRAM_OFFSET(0x51)
-#define COBT_CMOS_ROM_REV_REV		NVRAM_OFFSET(0x52)
-
-#define COBT_CMOS_BTO_CODE_0		NVRAM_OFFSET(0x53)
-#define COBT_CMOS_BTO_CODE_1		NVRAM_OFFSET(0x54)
-#define COBT_CMOS_BTO_CODE_2		NVRAM_OFFSET(0x55)
-#define COBT_CMOS_BTO_CODE_3		NVRAM_OFFSET(0x56)
-
-#define COBT_CMOS_BTO_IP_CSUM		NVRAM_OFFSET(0x57)
-#define COBT_CMOS_BTO_IP_0		NVRAM_OFFSET(0x58)
-#define COBT_CMOS_BTO_IP_1		NVRAM_OFFSET(0x59)
-#define COBT_CMOS_BTO_IP_2		NVRAM_OFFSET(0x5a)
-#define COBT_CMOS_BTO_IP_3		NVRAM_OFFSET(0x5b)
-
-#endif /* COBALT_NVRAM_H */
-- 
cgit v1.3-8-gc7d7


From dcf5008db171211e3c34c060cacfd788306b034b Mon Sep 17 00:00:00 2001
From: Nathan Lynch <ntl@pobox.com>
Date: Sun, 15 Jul 2007 23:41:09 -0700
Subject: remove unused lock_cpu_hotplug_interruptible definition

aa95387774039096c11803c04011f1aa42d85758 removed the implementation of
lock_cpu_hotplug_interruptible and all users of it.  This stub definition
for !CONFIG_HOTPLUG_CPU was left over -- kill it now.

Signed-off-by: Nathan Lynch <ntl@pobox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 3b2df2523f1d..c2236bbff412 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -120,7 +120,6 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
 
 #define lock_cpu_hotplug()	do { } while (0)
 #define unlock_cpu_hotplug()	do { } while (0)
-#define lock_cpu_hotplug_interruptible() 0
 #define hotcpu_notifier(fn, pri)	do { (void)(fn); } while (0)
 #define register_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
 #define unregister_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
-- 
cgit v1.3-8-gc7d7


From f4895925976977aaeda26ee2a603a99f17db500b Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Sun, 15 Jul 2007 23:41:13 -0700
Subject: Remove final two references to "__obsolete_setup" macro

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/init.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 56ec4c62eee0..5b5285316339 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -171,9 +171,6 @@ struct obs_kernel_param {
 #define __setup(str, fn)					\
 	__setup_param(str, fn, fn, 0)
 
-#define __obsolete_setup(str)					\
-	__setup_null_param(str, __LINE__)
-
 /* NOTE: fn is as per module_param, not __setup!  Emits warning if fn
  * returns non-zero. */
 #define early_param(str, fn)					\
@@ -239,7 +236,6 @@ void __init parse_early_param(void);
 #define __setup_param(str, unique_id, fn)	/* nothing */
 #define __setup_null_param(str, unique_id) 	/* nothing */
 #define __setup(str, func) 			/* nothing */
-#define __obsolete_setup(str) 			/* nothing */
 #endif
 
 /* Data marked not to be saved by software suspend */
-- 
cgit v1.3-8-gc7d7


From 213dd266d48af90c1eec8688c1ff31aa34d21de2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 15 Jul 2007 23:41:15 -0700
Subject: namespace: ensure clone_flags are always stored in an unsigned long

While working on unshare support for the network namespace I noticed we
were putting clone flags in an int.  Which is weird because the syscall
uses unsigned long and we at least need an unsigned to properly hold all of
the unshare flags.

So to make the code consistent, this patch updates the code to use
unsigned long instead of int for the clone flags in those places
where we get it wrong today.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c                | 2 +-
 include/linux/mnt_namespace.h | 2 +-
 include/linux/nsproxy.h       | 2 +-
 include/linux/pid_namespace.h | 2 +-
 include/linux/utsname.h       | 3 ++-
 kernel/nsproxy.c              | 6 +++---
 kernel/pid.c                  | 2 +-
 kernel/utsname.c              | 2 +-
 8 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 5585623f6252..9211da4fef53 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1509,7 +1509,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	return new_ns;
 }
 
-struct mnt_namespace *copy_mnt_ns(int flags, struct mnt_namespace *ns,
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 1fa4d9813b31..8eed44f8ca73 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -14,7 +14,7 @@ struct mnt_namespace {
 	int event;
 };
 
-extern struct mnt_namespace *copy_mnt_ns(int, struct mnt_namespace *,
+extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct fs_struct *);
 extern void __put_mnt_ns(struct mnt_namespace *ns);
 
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 6d179a397bfb..ce06188b7a56 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -32,7 +32,7 @@ struct nsproxy {
 };
 extern struct nsproxy init_nsproxy;
 
-int copy_namespaces(int flags, struct task_struct *tsk);
+int copy_namespaces(unsigned long flags, struct task_struct *tsk);
 void get_task_namespaces(struct task_struct *tsk);
 void free_nsproxy(struct nsproxy *ns);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 169c6c24209b..b9a17e08ff0f 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -29,7 +29,7 @@ static inline void get_pid_ns(struct pid_namespace *ns)
 	kref_get(&ns->kref);
 }
 
-extern struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *ns);
+extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns);
 extern void free_pid_ns(struct kref *kref);
 
 static inline void put_pid_ns(struct pid_namespace *ns)
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 51ad167611e4..923db99175f2 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -48,7 +48,8 @@ static inline void get_uts_ns(struct uts_namespace *ns)
 	kref_get(&ns->kref);
 }
 
-extern struct uts_namespace *copy_utsname(int flags, struct uts_namespace *ns);
+extern struct uts_namespace *copy_utsname(unsigned long flags,
+					struct uts_namespace *ns);
 extern void free_uts_ns(struct kref *kref);
 
 static inline void put_uts_ns(struct uts_namespace *ns)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 4b7fcc8f9a48..10f0bbba382b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -58,8 +58,8 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
  * Return the newly created nsproxy.  Do not attach this to the task,
  * leave it to the caller to do proper locking and attach it to task.
  */
-static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
-			struct fs_struct *new_fs)
+static struct nsproxy *create_new_namespaces(unsigned long flags,
+			struct task_struct *tsk, struct fs_struct *new_fs)
 {
 	struct nsproxy *new_nsp;
 	int err;
@@ -121,7 +121,7 @@ out_ns:
  * called from clone.  This now handles copy for nsproxy and all
  * namespaces therein.
  */
-int copy_namespaces(int flags, struct task_struct *tsk)
+int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 {
 	struct nsproxy *old_ns = tsk->nsproxy;
 	struct nsproxy *new_ns;
diff --git a/kernel/pid.c b/kernel/pid.c
index eb66bd2953ab..c6e3f9ffff87 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -365,7 +365,7 @@ struct pid *find_ge_pid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_get_pid);
 
-struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
 {
 	BUG_ON(!old_ns);
 	get_pid_ns(old_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 3ae43936bd88..9d8180a0f0d8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -39,7 +39,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
  * utsname of this process won't be seen by parent, and vice
  * versa.
  */
-struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns)
+struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
 {
 	struct uts_namespace *new_ns;
 
-- 
cgit v1.3-8-gc7d7


From 132e4b0a049c39337c535501561b8301c7f2b202 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@mindspring.com>
Date: Sun, 15 Jul 2007 23:41:19 -0700
Subject: cdrom: replace hard-coded constants by kernel.h macro.

Signed-off-by: Robert P. J. Day <rpjday@mindspring.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cdrom.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index f50f04bdbc16..2b641b176e7f 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -414,8 +414,8 @@ struct cdrom_generic_command
 #define CDO_CHECK_TYPE		0x10    /* check type on open for data */
 
 /* Special codes used when specifying changer slots. */
-#define CDSL_NONE       	((int) (~0U>>1)-1)
-#define CDSL_CURRENT    	((int) (~0U>>1))
+#define CDSL_NONE       	(INT_MAX-1)
+#define CDSL_CURRENT    	INT_MAX
 
 /* For partition based multisession access. IDE can handle 64 partitions
  * per drive - SCSI CD-ROM's use minors to differentiate between the
-- 
cgit v1.3-8-gc7d7


From 1d9d02feeee89e9132034d504c9a45eeaf618a3d Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@cpushare.com>
Date: Sun, 15 Jul 2007 23:41:32 -0700
Subject: move seccomp from /proc to a prctl

This reduces the memory footprint and it enforces that only the current
task can enable seccomp on itself (this is a requirement for a
strightforward [modulo preempt ;) ] TIF_NOTSC implementation).

Signed-off-by: Andrea Arcangeli <andrea@cpushare.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c          | 72 -------------------------------------------------
 include/linux/prctl.h   |  4 +++
 include/linux/seccomp.h | 15 +++++++++--
 kernel/seccomp.c        | 26 ++++++++++++++++++
 kernel/sys.c            |  8 ++++++
 5 files changed, 51 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d0921944e68c..ae3627337a92 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,7 +67,6 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
-#include <linux/seccomp.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
 #include <linux/poll.h>
@@ -817,71 +816,6 @@ static const struct file_operations proc_loginuid_operations = {
 };
 #endif
 
-#ifdef CONFIG_SECCOMP
-static ssize_t seccomp_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *ppos)
-{
-	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
-	char __buf[20];
-	size_t len;
-
-	if (!tsk)
-		return -ESRCH;
-	/* no need to print the trailing zero, so use only len */
-	len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
-	put_task_struct(tsk);
-
-	return simple_read_from_buffer(buf, count, ppos, __buf, len);
-}
-
-static ssize_t seccomp_write(struct file *file, const char __user *buf,
-			     size_t count, loff_t *ppos)
-{
-	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
-	char __buf[20], *end;
-	unsigned int seccomp_mode;
-	ssize_t result;
-
-	result = -ESRCH;
-	if (!tsk)
-		goto out_no_task;
-
-	/* can set it only once to be even more secure */
-	result = -EPERM;
-	if (unlikely(tsk->seccomp.mode))
-		goto out;
-
-	result = -EFAULT;
-	memset(__buf, 0, sizeof(__buf));
-	count = min(count, sizeof(__buf) - 1);
-	if (copy_from_user(__buf, buf, count))
-		goto out;
-
-	seccomp_mode = simple_strtoul(__buf, &end, 0);
-	if (*end == '\n')
-		end++;
-	result = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		tsk->seccomp.mode = seccomp_mode;
-		set_tsk_thread_flag(tsk, TIF_SECCOMP);
-	} else
-		goto out;
-	result = -EIO;
-	if (unlikely(!(end - __buf)))
-		goto out;
-	result = end - __buf;
-out:
-	put_task_struct(tsk);
-out_no_task:
-	return result;
-}
-
-static const struct file_operations proc_seccomp_operations = {
-	.read		= seccomp_read,
-	.write		= seccomp_write,
-};
-#endif /* CONFIG_SECCOMP */
-
 #ifdef CONFIG_FAULT_INJECTION
 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
 				      size_t count, loff_t *ppos)
@@ -2042,9 +1976,6 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("numa_maps",  S_IRUGO, numa_maps),
 #endif
 	REG("mem",        S_IRUSR|S_IWUSR, mem),
-#ifdef CONFIG_SECCOMP
-	REG("seccomp",    S_IRUSR|S_IWUSR, seccomp),
-#endif
 	LNK("cwd",        cwd),
 	LNK("root",       root),
 	LNK("exe",        exe),
@@ -2329,9 +2260,6 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("numa_maps", S_IRUGO, numa_maps),
 #endif
 	REG("mem",       S_IRUSR|S_IWUSR, mem),
-#ifdef CONFIG_SECCOMP
-	REG("seccomp",   S_IRUSR|S_IWUSR, seccomp),
-#endif
 	LNK("cwd",       cwd),
 	LNK("root",      root),
 	LNK("exe",       exe),
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 52a9be41250d..e2eff9079fe9 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -59,4 +59,8 @@
 # define PR_ENDIAN_LITTLE	1	/* True little endian mode */
 # define PR_ENDIAN_PPC_LITTLE	2	/* "PowerPC" pseudo little endian */
 
+/* Get/set process seccomp mode */
+#define PR_GET_SECCOMP	21
+#define PR_SET_SECCOMP	22
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 3e8b1cf54303..d708974dbfe3 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -4,8 +4,6 @@
 
 #ifdef CONFIG_SECCOMP
 
-#define NR_SECCOMP_MODES 1
-
 #include <linux/thread_info.h>
 #include <asm/seccomp.h>
 
@@ -23,6 +21,9 @@ static inline int has_secure_computing(struct thread_info *ti)
 	return unlikely(test_ti_thread_flag(ti, TIF_SECCOMP));
 }
 
+extern long prctl_get_seccomp(void);
+extern long prctl_set_seccomp(unsigned long);
+
 #else /* CONFIG_SECCOMP */
 
 typedef struct { } seccomp_t;
@@ -34,6 +35,16 @@ static inline int has_secure_computing(struct thread_info *ti)
 	return 0;
 }
 
+static inline long prctl_get_seccomp(void)
+{
+	return -EINVAL;
+}
+
+static inline long prctl_set_seccomp(unsigned long arg2)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_SECCOMP */
 
 #endif /* _LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index c3391b6020e8..1dfa8a509726 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -10,6 +10,7 @@
 #include <linux/sched.h>
 
 /* #define SECCOMP_DEBUG 1 */
+#define NR_SECCOMP_MODES 1
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -54,3 +55,28 @@ void __secure_computing(int this_syscall)
 #endif
 	do_exit(SIGKILL);
 }
+
+long prctl_get_seccomp(void)
+{
+	return current->seccomp.mode;
+}
+
+long prctl_set_seccomp(unsigned long seccomp_mode)
+{
+	long ret;
+
+	/* can set it only once to be even more secure */
+	ret = -EPERM;
+	if (unlikely(current->seccomp.mode))
+		goto out;
+
+	ret = -EINVAL;
+	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+		current->seccomp.mode = seccomp_mode;
+		set_thread_flag(TIF_SECCOMP);
+		ret = 0;
+	}
+
+ out:
+	return ret;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index ed92e2f03342..4d141ae3e802 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,6 +31,7 @@
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/seccomp.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -2242,6 +2243,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			error = SET_ENDIAN(current, arg2);
 			break;
 
+		case PR_GET_SECCOMP:
+			error = prctl_get_seccomp();
+			break;
+		case PR_SET_SECCOMP:
+			error = prctl_set_seccomp(arg2);
+			break;
+
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.3-8-gc7d7


From cf99abace7e07dd8491e7093a9a9ef11d48838ed Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@cpushare.com>
Date: Sun, 15 Jul 2007 23:41:33 -0700
Subject: make seccomp zerocost in schedule

This follows a suggestion from Chuck Ebbert on how to make seccomp
absolutely zerocost in schedule too.  The only remaining footprint of
seccomp is in terms of the bzImage size that becomes a few bytes (perhaps
even a few kbytes) larger, measure it if you care in the embedded.

Signed-off-by: Andrea Arcangeli <andrea@cpushare.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/kernel/process.c     | 73 ++++++++++++++++++++++--------------------
 include/asm-i386/processor.h   |  4 +++
 include/asm-i386/thread_info.h |  5 ++-
 include/linux/seccomp.h        | 10 ------
 kernel/seccomp.c               |  3 ++
 5 files changed, 50 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 06dfa65ad180..6c49acb96982 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -538,8 +538,31 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
 	return 1;
 }
 
-static noinline void __switch_to_xtra(struct task_struct *next_p,
-				    struct tss_struct *tss)
+#ifdef CONFIG_SECCOMP
+void hard_disable_TSC(void)
+{
+	write_cr4(read_cr4() | X86_CR4_TSD);
+}
+void disable_TSC(void)
+{
+	preempt_disable();
+	if (!test_and_set_thread_flag(TIF_NOTSC))
+		/*
+		 * Must flip the CPU state synchronously with
+		 * TIF_NOTSC in the current running context.
+		 */
+		hard_disable_TSC();
+	preempt_enable();
+}
+void hard_enable_TSC(void)
+{
+	write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+#endif /* CONFIG_SECCOMP */
+
+static noinline void
+__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+		 struct tss_struct *tss)
 {
 	struct thread_struct *next;
 
@@ -555,6 +578,17 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
 		set_debugreg(next->debugreg[7], 7);
 	}
 
+#ifdef CONFIG_SECCOMP
+	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+		/* prev and next are different */
+		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+			hard_disable_TSC();
+		else
+			hard_enable_TSC();
+	}
+#endif
+
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
 		/*
 		 * Disable the bitmap via an invalid offset. We still cache
@@ -585,33 +619,6 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
 	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
 }
 
-/*
- * This function selects if the context switch from prev to next
- * has to tweak the TSC disable bit in the cr4.
- */
-static inline void disable_tsc(struct task_struct *prev_p,
-			       struct task_struct *next_p)
-{
-	struct thread_info *prev, *next;
-
-	/*
-	 * gcc should eliminate the ->thread_info dereference if
-	 * has_secure_computing returns 0 at compile time (SECCOMP=n).
-	 */
-	prev = task_thread_info(prev_p);
-	next = task_thread_info(next_p);
-
-	if (has_secure_computing(prev) || has_secure_computing(next)) {
-		/* slow path here */
-		if (has_secure_computing(prev) &&
-		    !has_secure_computing(next)) {
-			write_cr4(read_cr4() & ~X86_CR4_TSD);
-		} else if (!has_secure_computing(prev) &&
-			   has_secure_computing(next))
-			write_cr4(read_cr4() | X86_CR4_TSD);
-	}
-}
-
 /*
  *	switch_to(x,yn) should switch tasks from x to y.
  *
@@ -689,11 +696,9 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	/*
 	 * Now maybe handle debug registers and/or IO bitmaps
 	 */
-	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
-	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)))
-		__switch_to_xtra(next_p, tss);
-
-	disable_tsc(prev_p, next_p);
+	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
+		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+		__switch_to_xtra(prev_p, next_p, tss);
 
 	/*
 	 * Leave lazy mode, flushing any hypercalls made here.
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 94e0c147c165..422cffef00c9 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -228,6 +228,10 @@ extern int bootloader_type;
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
 
+extern void hard_disable_TSC(void);
+extern void disable_TSC(void);
+extern void hard_enable_TSC(void);
+
 /*
  * Size of io_bitmap.
  */
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 4cb0f91ae64f..54424e045e01 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -137,6 +137,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_DEBUG		17	/* uses debug registers */
 #define TIF_IO_BITMAP		18	/* uses I/O bitmap */
 #define TIF_FREEZE		19	/* is freezing for suspend */
+#define TIF_NOTSC		20	/* TSC is not accessible in userland */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -151,6 +152,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_DEBUG		(1<<TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1<<TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
+#define _TIF_NOTSC		(1<<TIF_NOTSC)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
@@ -160,7 +162,8 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_ALLWORK_MASK	(0x0000FFFF & ~_TIF_SECCOMP)
 
 /* flags to check in __switch_to() */
-#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
+#define _TIF_WORK_CTXSW_NEXT (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUG)
+#define _TIF_WORK_CTXSW_PREV (_TIF_IO_BITMAP | _TIF_NOTSC)
 
 /*
  * Thread-synchronous status.
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index d708974dbfe3..262a8dccfa81 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -16,11 +16,6 @@ static inline void secure_computing(int this_syscall)
 		__secure_computing(this_syscall);
 }
 
-static inline int has_secure_computing(struct thread_info *ti)
-{
-	return unlikely(test_ti_thread_flag(ti, TIF_SECCOMP));
-}
-
 extern long prctl_get_seccomp(void);
 extern long prctl_set_seccomp(unsigned long);
 
@@ -29,11 +24,6 @@ extern long prctl_set_seccomp(unsigned long);
 typedef struct { } seccomp_t;
 
 #define secure_computing(x) do { } while (0)
-/* static inline to preserve typechecking */
-static inline int has_secure_computing(struct thread_info *ti)
-{
-	return 0;
-}
 
 static inline long prctl_get_seccomp(void)
 {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1dfa8a509726..ad64fcb731f2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -74,6 +74,9 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
 	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
 		current->seccomp.mode = seccomp_mode;
 		set_thread_flag(TIF_SECCOMP);
+#ifdef TIF_NOTSC
+		disable_TSC();
+#endif
 		ret = 0;
 	}
 
-- 
cgit v1.3-8-gc7d7


From cc2ea416b2aa04d0c34ff2281a23dae5b76b7b3b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sun, 15 Jul 2007 23:41:38 -0700
Subject: uninline check_signature()

This is a rather bizarre thing to have inlined in io.h.  Stick it in lib/
instead.

While we're there, despaghetti it a bit, and fix its off-by-one behaviour when
passed a zero length.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/io.h    | 29 ++---------------------------
 lib/Makefile          |  2 +-
 lib/check_signature.c | 26 ++++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 28 deletions(-)
 create mode 100644 lib/check_signature.c

(limited to 'include/linux')

diff --git a/include/linux/io.h b/include/linux/io.h
index 8423dd376514..e3b2dda6c8eb 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -63,32 +63,7 @@ void __iomem * devm_ioremap(struct device *dev, unsigned long offset,
 void __iomem * devm_ioremap_nocache(struct device *dev, unsigned long offset,
 				    unsigned long size);
 void devm_iounmap(struct device *dev, void __iomem *addr);
-
-/**
- *	check_signature		-	find BIOS signatures
- *	@io_addr: mmio address to check
- *	@signature:  signature block
- *	@length: length of signature
- *
- *	Perform a signature comparison with the mmio address io_addr. This
- *	address should have been obtained by ioremap.
- *	Returns 1 on a match.
- */
-
-static inline int check_signature(const volatile void __iomem *io_addr,
-	const unsigned char *signature, int length)
-{
-	int retval = 0;
-	do {
-		if (readb(io_addr) != *signature)
-			goto out;
-		io_addr++;
-		signature++;
-		length--;
-	} while (length);
-	retval = 1;
-out:
-	return retval;
-}
+int check_signature(const volatile void __iomem *io_addr,
+			const unsigned char *signature, int length);
 
 #endif /* _LINUX_IO_H */
diff --git a/lib/Makefile b/lib/Makefile
index d1b366bdf86e..d7a93ff7f5a0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-$(CONFIG_SMP) += cpumask.o
 lib-y	+= kobject.o kref.o kobject_uevent.o klist.o
 
 obj-y += div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
-	 bust_spinlocks.o hexdump.o
+	 bust_spinlocks.o hexdump.o check_signature.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
diff --git a/lib/check_signature.c b/lib/check_signature.c
new file mode 100644
index 000000000000..fd6af199247b
--- /dev/null
+++ b/lib/check_signature.c
@@ -0,0 +1,26 @@
+#include <linux/io.h>
+#include <linux/module.h>
+
+/**
+ *	check_signature		-	find BIOS signatures
+ *	@io_addr: mmio address to check
+ *	@signature:  signature block
+ *	@length: length of signature
+ *
+ *	Perform a signature comparison with the mmio address io_addr. This
+ *	address should have been obtained by ioremap.
+ *	Returns 1 on a match.
+ */
+
+int check_signature(const volatile void __iomem *io_addr,
+			const unsigned char *signature, int length)
+{
+	while (length--) {
+		if (readb(io_addr) != *signature)
+			return 0;
+		io_addr++;
+		signature++;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(check_signature);
-- 
cgit v1.3-8-gc7d7


From 608e2619682e951f525b08e7a48669a3c0263b41 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 15 Jul 2007 23:41:39 -0700
Subject: generic bug: use show_regs() instead of dump_stack()

The current generic bug implementation has a call to dump_stack() in case a
WARN_ON(whatever) gets hit.  Since report_bug(), which calls dump_stack(),
gets called from an exception handler we can do better: just pass the
pt_regs structure to report_bug() and pass it to show_regs() in case of a
warning.  This will give more debug informations like register contents,
etc...  In addition this avoids some pointless lines that dump_stack()
emits, since it includes a stack backtrace of the exception handler which
is of no interest in case of a warning.  E.g.  on s390 the following lines
are currently always present in a stack backtrace if dump_stack() gets
called from report_bug():

 [<000000000001517a>] show_trace+0x92/0xe8)
 [<0000000000015270>] show_stack+0xa0/0xd0
 [<00000000000152ce>] dump_stack+0x2e/0x3c
 [<0000000000195450>] report_bug+0x98/0xf8
 [<0000000000016cc8>] illegal_op+0x1fc/0x21c
 [<00000000000227d6>] sysc_return+0x0/0x10

Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>
Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Kyle McMartin <kyle@parisc-linux.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/avr32/kernel/traps.c   | 2 +-
 arch/i386/kernel/traps.c    | 2 +-
 arch/parisc/kernel/traps.c  | 2 +-
 arch/powerpc/kernel/traps.c | 2 +-
 arch/ppc/kernel/traps.c     | 2 +-
 arch/s390/kernel/traps.c    | 2 +-
 arch/sh/kernel/traps.c      | 2 +-
 arch/x86_64/kernel/traps.c  | 2 +-
 include/linux/bug.h         | 7 +++++--
 lib/bug.c                   | 5 +++--
 10 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index 86d107511dd4..aaa792815cd7 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -184,7 +184,7 @@ asmlinkage void do_illegal_opcode(unsigned long ecr, struct pt_regs *regs)
 	if (!user_mode(regs) && (ecr == ECR_ILLEGAL_OPCODE)) {
 		enum bug_trap_type type;
 
-		type = report_bug(regs->pc);
+		type = report_bug(regs->pc, regs);
 		switch (type) {
 		case BUG_TRAP_TYPE_NONE:
 			break;
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 90da0575fcff..28bd1c5163ec 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -390,7 +390,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 		unsigned long esp;
 		unsigned short ss;
 
-		report_bug(regs->eip);
+		report_bug(regs->eip, regs);
 
 		printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
 #ifdef CONFIG_PREEMPT
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index c3ec9f1ec0f3..f9bca2d74b38 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -302,7 +302,7 @@ static void handle_break(struct pt_regs *regs)
 	if (unlikely(iir == PARISC_BUG_BREAK_INSN && !user_mode(regs))) {
 		/* check if a BUG() or WARN() trapped here.  */
 		enum bug_trap_type tt;
-		tt = report_bug(regs->iaoq[0] & ~3);
+		tt = report_bug(regs->iaoq[0] & ~3, regs);
 		if (tt == BUG_TRAP_TYPE_WARN) {
 			regs->iaoq[0] += 4;
 			regs->iaoq[1] += 4;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index bf6445ac9f1c..3b8427e6283d 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -777,7 +777,7 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 			return;
 
 		if (!(regs->msr & MSR_PR) &&  /* not user-mode */
-		    report_bug(regs->nip) == BUG_TRAP_TYPE_WARN) {
+		    report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
 			regs->nip += 4;
 			return;
 		}
diff --git a/arch/ppc/kernel/traps.c b/arch/ppc/kernel/traps.c
index aea100be52c8..0eaef7c8378b 100644
--- a/arch/ppc/kernel/traps.c
+++ b/arch/ppc/kernel/traps.c
@@ -619,7 +619,7 @@ void program_check_exception(struct pt_regs *regs)
 			return;
 
 		if (!(regs->msr & MSR_PR) &&  /* not user-mode */
-		    report_bug(regs->nip) == BUG_TRAP_TYPE_WARN) {
+		    report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
 			regs->nip += 4;
 			return;
 		}
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index ee9186f8fb08..81e03b9c3841 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -319,7 +319,7 @@ static void __kprobes inline do_trap(long interruption_code, int signr,
 		else {
 			enum bug_trap_type btt;
 
-			btt = report_bug(regs->psw.addr & PSW_ADDR_INSN);
+			btt = report_bug(regs->psw.addr & PSW_ADDR_INSN, regs);
 			if (btt == BUG_TRAP_TYPE_WARN)
 				return;
 			die(str, regs, interruption_code);
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 8f18930d5bf8..09480887076b 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -874,7 +874,7 @@ void __init trap_init(void)
 void handle_BUG(struct pt_regs *regs)
 {
 	enum bug_trap_type tt;
-	tt = report_bug(regs->pc);
+	tt = report_bug(regs->pc, regs);
 	if (tt == BUG_TRAP_TYPE_WARN) {
 		regs->pc += 2;
 		return;
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index aac1c0be54c6..7fa155c394d9 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -531,7 +531,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 	unsigned long flags = oops_begin();
 
 	if (!user_mode(regs))
-		report_bug(regs->rip);
+		report_bug(regs->rip, regs);
 
 	__die(str, regs, err);
 	oops_end(flags);
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 42aa0a54b6f4..54398d2c6d8d 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -10,6 +10,8 @@ enum bug_trap_type {
 	BUG_TRAP_TYPE_BUG = 2,
 };
 
+struct pt_regs;
+
 #ifdef CONFIG_GENERIC_BUG
 #include <asm-generic/bug.h>
 
@@ -20,7 +22,7 @@ static inline int is_warning_bug(const struct bug_entry *bug)
 
 const struct bug_entry *find_bug(unsigned long bugaddr);
 
-enum bug_trap_type report_bug(unsigned long bug_addr);
+enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
 
 int  module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
 			 struct module *);
@@ -31,7 +33,8 @@ int is_valid_bugaddr(unsigned long addr);
 
 #else	/* !CONFIG_GENERIC_BUG */
 
-static inline enum bug_trap_type report_bug(unsigned long bug_addr)
+static inline enum bug_trap_type report_bug(unsigned long bug_addr,
+					    struct pt_regs *regs)
 {
 	return BUG_TRAP_TYPE_BUG;
 }
diff --git a/lib/bug.c b/lib/bug.c
index 014b582c5c4b..530f38f55787 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -38,6 +38,7 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/bug.h>
+#include <linux/sched.h>
 
 extern const struct bug_entry __start___bug_table[], __stop___bug_table[];
 
@@ -112,7 +113,7 @@ const struct bug_entry *find_bug(unsigned long bugaddr)
 	return module_find_bug(bugaddr);
 }
 
-enum bug_trap_type report_bug(unsigned long bugaddr)
+enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
 	const struct bug_entry *bug;
 	const char *file;
@@ -147,7 +148,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr)
 			       "[verbose debug info unavailable]\n",
 			       (void *)bugaddr);
 
-		dump_stack();
+		show_regs(regs);
 		return BUG_TRAP_TYPE_WARN;
 	}
 
-- 
cgit v1.3-8-gc7d7


From d52988023a37720e9e4aeb66362be67fa21d8836 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Sun, 15 Jul 2007 23:41:42 -0700
Subject: Remove the last few UMSDOS leftovers

The UMSDOS filesystem was removed back in 2.6.11, but some tiny bits stuck
around.  This patch removes the few remaining leftovers.  The only things
left behind after this are the entries in the CREDITS file and the ioctl
number in Documentation/ioctl-number.txt as documentation.

This third (hopefully final) version of the patch doesn't edit the
arch/um/config.release file, since Jeff Dike pointed out to me that it
should die completely, and asked me to remove it from my patch as he'll
send in a seperate patch removing the file completely.

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ioctl-number.txt | 2 +-
 arch/arm26/defconfig           | 1 -
 arch/cris/arch-v10/defconfig   | 1 -
 include/linux/ncp_fs.h         | 2 --
 4 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt
index 3de7d379cf07..5c7fbf9d96b4 100644
--- a/Documentation/ioctl-number.txt
+++ b/Documentation/ioctl-number.txt
@@ -67,7 +67,7 @@ Code	Seq#	Include File		Comments
 0x00	00-1F	linux/wavefront.h	conflict!
 0x02	all	linux/fd.h
 0x03	all	linux/hdreg.h
-0x04	all	linux/umsdos_fs.h
+0x04	D2-DC	linux/umsdos_fs.h	Dead since 2.6.11, but don't reuse these.
 0x06	all	linux/lp.h
 0x09	all	linux/md.h
 0x12	all	linux/fs.h
diff --git a/arch/arm26/defconfig b/arch/arm26/defconfig
index c4a89703c3d8..2b7d44bf49bf 100644
--- a/arch/arm26/defconfig
+++ b/arch/arm26/defconfig
@@ -248,7 +248,6 @@ CONFIG_I2C_CHARDEV=y
 # CONFIG_JBD_DEBUG is not set
 # CONFIG_FAT_FS is not set
 # CONFIG_MSDOS_FS is not set
-# CONFIG_UMSDOS_FS is not set
 # CONFIG_VFAT_FS is not set
 # CONFIG_EFS_FS is not set
 # CONFIG_JFFS_FS is not set
diff --git a/arch/cris/arch-v10/defconfig b/arch/cris/arch-v10/defconfig
index 2a3411eaace9..710c20ba2be7 100644
--- a/arch/cris/arch-v10/defconfig
+++ b/arch/cris/arch-v10/defconfig
@@ -429,7 +429,6 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_BFS_FS is not set
 # CONFIG_FAT_FS is not set
 # CONFIG_MSDOS_FS is not set
-# CONFIG_UMSDOS_FS is not set
 # CONFIG_VFAT_FS is not set
 # CONFIG_EFS_FS is not set
 # CONFIG_JFFS_FS is not set
diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h
index 83e39eb054d3..88766e43e121 100644
--- a/include/linux/ncp_fs.h
+++ b/include/linux/ncp_fs.h
@@ -148,8 +148,6 @@ struct ncp_nls_ioctl
 #include <linux/ncp_fs_i.h>
 #include <linux/ncp_fs_sb.h>
 
-/* undef because public define in umsdos_fs.h (ncp_fs.h isn't public) */
-#undef PRINTK
 /* define because it is easy to change PRINTK to {*}PRINTK */
 #define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
 
-- 
cgit v1.3-8-gc7d7


From f5a421a4509a7e2dff11da0f01b0548f4f84d503 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 15 Jul 2007 23:41:44 -0700
Subject: rename cancel_rearming_delayed_work() to cancel_delayed_work_sync()

Imho, the current naming of cancel_xxx workqueue functions is very confusing.

	cancel_delayed_work()
	cancel_rearming_delayed_work()
	cancel_rearming_delayed_workqueue()	// obsolete

	cancel_work_sync()

This looks as if the first 2 functions differ in "type" of their argument
which is not true any longer, nowadays the difference is the behaviour.

The semantics of cancel_rearming_delayed_work(dwork) was changed
significantly, it doesn't require that dwork rearms itself, and cancels dwork
synchronously.

Rename it to cancel_delayed_work_sync().  This matches cancel_delayed_work()
and cancel_work_sync().  Re-create cancel_rearming_delayed_work() as a simple
inline obsolete wrapper, like cancel_rearming_delayed_workqueue().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Jarek Poplawski <jarkao2@o2.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h | 13 ++++++++++---
 kernel/workqueue.c        |  6 +++---
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ce0719a2cfeb..5c89ac6e7f55 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -166,14 +166,21 @@ static inline int cancel_delayed_work(struct delayed_work *work)
 	return ret;
 }
 
-extern void cancel_rearming_delayed_work(struct delayed_work *work);
+extern void cancel_delayed_work_sync(struct delayed_work *work);
 
-/* Obsolete. use cancel_rearming_delayed_work() */
+/* Obsolete. use cancel_delayed_work_sync() */
 static inline
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
 					struct delayed_work *work)
 {
-	cancel_rearming_delayed_work(work);
+	cancel_delayed_work_sync(work);
+}
+
+/* Obsolete. use cancel_delayed_work_sync() */
+static inline
+void cancel_rearming_delayed_work(struct delayed_work *work)
+{
+	cancel_delayed_work_sync(work);
 }
 
 #endif
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3bebf73be976..ad9656886daa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -486,13 +486,13 @@ void cancel_work_sync(struct work_struct *work)
 EXPORT_SYMBOL_GPL(cancel_work_sync);
 
 /**
- * cancel_rearming_delayed_work - reliably kill off a delayed work.
+ * cancel_delayed_work_sync - reliably kill off a delayed work.
  * @dwork: the delayed work struct
  *
  * It is possible to use this function if @dwork rearms itself via queue_work()
  * or queue_delayed_work(). See also the comment for cancel_work_sync().
  */
-void cancel_rearming_delayed_work(struct delayed_work *dwork)
+void cancel_delayed_work_sync(struct delayed_work *dwork)
 {
 	while (!del_timer(&dwork->timer) &&
 	       !try_to_grab_pending(&dwork->work))
@@ -500,7 +500,7 @@ void cancel_rearming_delayed_work(struct delayed_work *dwork)
 	wait_on_work(&dwork->work);
 	work_clear_pending(&dwork->work);
 }
-EXPORT_SYMBOL(cancel_rearming_delayed_work);
+EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 static struct workqueue_struct *keventd_wq __read_mostly;
 
-- 
cgit v1.3-8-gc7d7


From 1f1f642e2f092e37eb9038060eb0100c44f55a11 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 15 Jul 2007 23:41:44 -0700
Subject: make cancel_xxx_work_sync() return a boolean

Change cancel_work_sync() and cancel_delayed_work_sync() to return a boolean
indicating whether the work was actually cancelled.  A zero return value means
that the work was not pending/queued.

Without that kind of change it is not possible to avoid flush_workqueue()
sometimes, see the next patch as an example.

Also, this patch unifies both functions and kills the (unlikely) busy-wait
loop.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Jarek Poplawski <jarkao2@o2.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h |  4 ++--
 kernel/workqueue.c        | 41 +++++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 5c89ac6e7f55..ce6badc98f6d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -148,7 +148,7 @@ extern int keventd_up(void);
 extern void init_workqueues(void);
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
-extern void cancel_work_sync(struct work_struct *work);
+extern int cancel_work_sync(struct work_struct *work);
 
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
@@ -166,7 +166,7 @@ static inline int cancel_delayed_work(struct delayed_work *work)
 	return ret;
 }
 
-extern void cancel_delayed_work_sync(struct delayed_work *work);
+extern int cancel_delayed_work_sync(struct delayed_work *work);
 
 /* Obsolete. use cancel_delayed_work_sync() */
 static inline
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ad9656886daa..d7d3fa3072e5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -382,16 +382,16 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
 /*
- * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
+ * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
  * so this work can't be re-armed in any way.
  */
 static int try_to_grab_pending(struct work_struct *work)
 {
 	struct cpu_workqueue_struct *cwq;
-	int ret = 0;
+	int ret = -1;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
-		return 1;
+		return 0;
 
 	/*
 	 * The queueing is in progress, or it is already queued. Try to
@@ -457,10 +457,28 @@ static void wait_on_work(struct work_struct *work)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
+static int __cancel_work_timer(struct work_struct *work,
+				struct timer_list* timer)
+{
+	int ret;
+
+	do {
+		ret = (timer && likely(del_timer(timer)));
+		if (!ret)
+			ret = try_to_grab_pending(work);
+		wait_on_work(work);
+	} while (unlikely(ret < 0));
+
+	work_clear_pending(work);
+	return ret;
+}
+
 /**
  * cancel_work_sync - block until a work_struct's callback has terminated
  * @work: the work which is to be flushed
  *
+ * Returns true if @work was pending.
+ *
  * cancel_work_sync() will cancel the work if it is queued. If the work's
  * callback appears to be running, cancel_work_sync() will block until it
  * has completed.
@@ -476,12 +494,9 @@ static void wait_on_work(struct work_struct *work)
  * The caller must ensure that workqueue_struct on which this work was last
  * queued can't be destroyed before this function returns.
  */
-void cancel_work_sync(struct work_struct *work)
+int cancel_work_sync(struct work_struct *work)
 {
-	while (!try_to_grab_pending(work))
-		cpu_relax();
-	wait_on_work(work);
-	work_clear_pending(work);
+	return __cancel_work_timer(work, NULL);
 }
 EXPORT_SYMBOL_GPL(cancel_work_sync);
 
@@ -489,16 +504,14 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
  * cancel_delayed_work_sync - reliably kill off a delayed work.
  * @dwork: the delayed work struct
  *
+ * Returns true if @dwork was pending.
+ *
  * It is possible to use this function if @dwork rearms itself via queue_work()
  * or queue_delayed_work(). See also the comment for cancel_work_sync().
  */
-void cancel_delayed_work_sync(struct delayed_work *dwork)
+int cancel_delayed_work_sync(struct delayed_work *dwork)
 {
-	while (!del_timer(&dwork->timer) &&
-	       !try_to_grab_pending(&dwork->work))
-		cpu_relax();
-	wait_on_work(&dwork->work);
-	work_clear_pending(&dwork->work);
+	return __cancel_work_timer(&dwork->work, &dwork->timer);
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
-- 
cgit v1.3-8-gc7d7


From 8f8a68ee486e1c81eaead3c521822bf86142d380 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 15 Jul 2007 23:41:52 -0700
Subject: remove mm/backing-dev.c:congestion_wait_interruptible()

congestion_wait_interruptible() is no longer used.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/backing-dev.h |  1 -
 mm/backing-dev.c            | 16 ----------------
 2 files changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f2542c24b328..7011d6255593 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -93,7 +93,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
 void set_bdi_congested(struct backing_dev_info *bdi, int rw);
 long congestion_wait(int rw, long timeout);
-long congestion_wait_interruptible(int rw, long timeout);
 void congestion_end(int rw);
 
 #define bdi_cap_writeback_dirty(bdi) \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e5de3781d3fe..f50a2811f9dc 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
-long congestion_wait_interruptible(int rw, long timeout)
-{
-	long ret;
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
-	if (signal_pending(current))
-		ret = -ERESTARTSYS;
-	else
-		ret = io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
-	return ret;
-}
-EXPORT_SYMBOL(congestion_wait_interruptible);
-
 /**
  * congestion_end - wake up sleepers on a congested backing_dev_info
  * @rw: READ or WRITE
-- 
cgit v1.3-8-gc7d7


From 2235219b7721b8e74de6841e79240936561a2b63 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sun, 15 Jul 2007 23:41:58 -0700
Subject: ext2: statfs speed up

This is a patch that speeds up statfs.  It is very simple - the "overhead"
calculation, which takes a huge amount of time for large filesystems, never
changes unless the size of the filesystem itself changes.  That means we can
store it in memory and only recalculate if the filesystem has been resized
(almost never).

It also fixes a minor problem that we never update the on-disk superblock free
blocks/inodes counts until the filesystem is unmounted.  While not fatal, we
may as well update that on disk when we have the information, and it makes
things like debugfs and dumpe2fs report a bit more accurate info.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/super.c            | 20 ++++++++++++++------
 include/linux/ext2_fs_sb.h |  2 ++
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5de5061eb331..b2efd9083b9b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1099,15 +1099,18 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
-	unsigned long overhead;
-	int i;
 	u64 fsid;
 
 	if (test_opt (sb, MINIX_DF))
-		overhead = 0;
-	else {
+		sbi->s_overhead_last = 0;
+	else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
+		unsigned long i, overhead = 0;
+		smp_rmb();
+
 		/*
-		 * Compute the overhead (FS structures)
+		 * Compute the overhead (FS structures). This is constant
+		 * for a given filesystem unless the number of block groups
+		 * changes so we cache the previous value until it does.
 		 */
 
 		/*
@@ -1131,17 +1134,22 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 		 */
 		overhead += (sbi->s_groups_count *
 			     (2 + sbi->s_itb_per_group));
+		sbi->s_overhead_last = overhead;
+		smp_wmb();
+		sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
 	}
 
 	buf->f_type = EXT2_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
+	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
 	buf->f_bfree = ext2_count_free_blocks(sb);
+	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = ext2_count_free_inodes(sb);
+	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT2_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/include/linux/ext2_fs_sb.h b/include/linux/ext2_fs_sb.h
index 4eda0ed76a48..d149f2959e67 100644
--- a/include/linux/ext2_fs_sb.h
+++ b/include/linux/ext2_fs_sb.h
@@ -33,6 +33,8 @@ struct ext2_sb_info {
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
 	unsigned long s_groups_count;	/* Number of groups in the fs */
+	unsigned long s_overhead_last;  /* Last calculated overhead */
+	unsigned long s_blocks_last;    /* Last seen block count */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext2_super_block * s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head ** s_group_desc;
-- 
cgit v1.3-8-gc7d7


From a71ce8c6c9bf269b192f352ea555217815cf027e Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sun, 15 Jul 2007 23:41:59 -0700
Subject: ext3: statfs speed up

This is a patch that speeds up statfs.  It is very simple - the "overhead"
calculation, which takes a huge amount of time for large filesystems, never
changes unless the size of the filesystem itself changes.  That means we can
store it in memory and only recalculate if the filesystem has been resized
(almost never).

It also fixes a minor problem that we never update the on-disk superblock free
blocks/inodes counts until the filesystem is unmounted.  While not fatal, we
may as well update that on disk when we have the information, and it makes
things like debugfs and dumpe2fs report a bit more accurate info.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c            | 25 +++++++++++++++----------
 include/linux/ext3_fs_sb.h |  2 ++
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8b75f73ba3b4..51d1c456cdab 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2426,19 +2426,19 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	ext3_fsblk_t overhead;
-	int i;
 	u64 fsid;
 
-	if (test_opt (sb, MINIX_DF))
-		overhead = 0;
-	else {
-		unsigned long ngroups;
-		ngroups = EXT3_SB(sb)->s_groups_count;
+	if (test_opt(sb, MINIX_DF)) {
+		sbi->s_overhead_last = 0;
+	} else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
+		unsigned long ngroups = sbi->s_groups_count, i;
+		ext3_fsblk_t overhead = 0;
 		smp_rmb();
 
 		/*
-		 * Compute the overhead (FS structures)
+		 * Compute the overhead (FS structures).  This is constant
+		 * for a given filesystem unless the number of block groups
+		 * changes so we cache the previous value until it does.
 		 */
 
 		/*
@@ -2462,18 +2462,23 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 		 * Every block group has an inode bitmap, a block
 		 * bitmap, and an inode table.
 		 */
-		overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group));
+		overhead += ngroups * (2 + sbi->s_itb_per_group);
+		sbi->s_overhead_last = overhead;
+		smp_wmb();
+		sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
 	}
 
 	buf->f_type = EXT3_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
+	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
 	buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT3_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h
index f61309c81cc4..d3c08353edf6 100644
--- a/include/linux/ext3_fs_sb.h
+++ b/include/linux/ext3_fs_sb.h
@@ -38,6 +38,8 @@ struct ext3_sb_info {
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
 	unsigned long s_groups_count;	/* Number of groups in the fs */
+	unsigned long s_overhead_last;  /* Last calculated overhead */
+	unsigned long s_blocks_last;    /* Last seen block count */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext3_super_block * s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head ** s_group_desc;
-- 
cgit v1.3-8-gc7d7


From 5e70030d4cf91613530a23b40ad9919bb9ee114f Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sun, 15 Jul 2007 23:42:00 -0700
Subject: ext4: statfs speed up

This is a patch that speeds up statfs.  It is very simple - the "overhead"
calculation, which takes a huge amount of time for large filesystems, never
changes unless the size of the filesystem itself changes.  That means we can
store it in memory and only recalculate if the filesystem has been resized
(almost never).

It also fixes a minor problem that we never update the on-disk superblock free
blocks/inodes counts until the filesystem is unmounted.  While not fatal, we
may as well update that on disk when we have the information, and it makes
things like debugfs and dumpe2fs report a bit more accurate info.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/super.c            | 25 +++++++++++++++----------
 include/linux/ext4_fs_sb.h |  2 ++
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index adcbfadfcb4c..d0d8c76c7edb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2500,19 +2500,19 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	ext4_fsblk_t overhead;
-	int i;
 	u64 fsid;
 
-	if (test_opt (sb, MINIX_DF))
-		overhead = 0;
-	else {
-		unsigned long ngroups;
-		ngroups = EXT4_SB(sb)->s_groups_count;
+	if (test_opt(sb, MINIX_DF)) {
+		sbi->s_overhead_last = 0;
+	} else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
+		unsigned long ngroups = sbi->s_groups_count, i;
+		ext4_fsblk_t overhead = 0;
 		smp_rmb();
 
 		/*
-		 * Compute the overhead (FS structures)
+		 * Compute the overhead (FS structures).  This is constant
+		 * for a given filesystem unless the number of block groups
+		 * changes so we cache the previous value until it does.
 		 */
 
 		/*
@@ -2536,18 +2536,23 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
 		 * Every block group has an inode bitmap, a block
 		 * bitmap, and an inode table.
 		 */
-		overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group));
+		overhead += ngroups * (2 + sbi->s_itb_per_group);
+		sbi->s_overhead_last = overhead;
+		smp_wmb();
+		sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
 	}
 
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
-	buf->f_blocks = ext4_blocks_count(es) - overhead;
+	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
 	buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT4_NAME_LEN;
 	fsid = le64_to_cpup((void *)es->s_uuid) ^
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 691a713139ce..2347557a327a 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -39,6 +39,8 @@ struct ext4_sb_info {
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
 	unsigned long s_groups_count;	/* Number of groups in the fs */
+	unsigned long s_overhead_last;  /* Last calculated overhead */
+	unsigned long s_blocks_last;    /* Last seen block count */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext4_super_block * s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head ** s_group_desc;
-- 
cgit v1.3-8-gc7d7


From 2e27afb300b56d83bb03fbfa68852b9c1e2920c6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Mon, 16 Jul 2007 14:31:08 -0700
Subject: Revert "[NET]: Fix races in net_rx_action vs netpoll."

This reverts commit 29578624e354f56143d92510fff33a8b2aaa2c03.

Ingo Molnar reports complete breakage with his e1000 card (no
networking, card reports transmit timeouts), and bisected it down to
this commit.  Let's figure out what went wrong, but not keep breaking
machines until we do.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Olaf Kirch <olaf.kirch@oracle.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/netdevice.h | 10 ----------
 net/core/netpoll.c        |  8 --------
 2 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 322b5eae57dd..da7a13c97eb8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -262,8 +262,6 @@ enum netdev_state_t
 	__LINK_STATE_LINKWATCH_PENDING,
 	__LINK_STATE_DORMANT,
 	__LINK_STATE_QDISC_RUNNING,
-	/* Set by the netpoll NAPI code */
-	__LINK_STATE_POLL_LIST_FROZEN,
 };
 
 
@@ -1022,14 +1020,6 @@ static inline void netif_rx_complete(struct net_device *dev)
 {
 	unsigned long flags;
 
-#ifdef CONFIG_NETPOLL
-	/* Prevent race with netpoll - yes, this is a kludge.
-	 * But at least it doesn't penalize the non-netpoll
-	 * code path. */
-	if (test_bit(__LINK_STATE_POLL_LIST_FROZEN, &dev->state))
-		return;
-#endif
-
 	local_irq_save(flags);
 	__netif_rx_complete(dev);
 	local_irq_restore(flags);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index d1264e9a50a8..de1b26aa5720 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -124,13 +124,6 @@ static void poll_napi(struct netpoll *np)
 	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
 	    npinfo->poll_owner != smp_processor_id() &&
 	    spin_trylock(&npinfo->poll_lock)) {
-		/* When calling dev->poll from poll_napi, we may end up in
-		 * netif_rx_complete. However, only the CPU to which the
-		 * device was queued is allowed to remove it from poll_list.
-		 * Setting POLL_LIST_FROZEN tells netif_rx_complete
-		 * to leave the NAPI state alone.
-		 */
-		set_bit(__LINK_STATE_POLL_LIST_FROZEN, &np->dev->state);
 		npinfo->rx_flags |= NETPOLL_RX_DROP;
 		atomic_inc(&trapped);
 
@@ -138,7 +131,6 @@ static void poll_napi(struct netpoll *np)
 
 		atomic_dec(&trapped);
 		npinfo->rx_flags &= ~NETPOLL_RX_DROP;
-		clear_bit(__LINK_STATE_POLL_LIST_FROZEN, &np->dev->state);
 		spin_unlock(&npinfo->poll_lock);
 	}
 }
-- 
cgit v1.3-8-gc7d7


From 13bd59a111760bb7cba8dcf17b6b55a0d99d3592 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Tue, 17 Jul 2007 14:18:47 +0200
Subject: Don't define empty struct bsg_class_device if !CONFIG_BLK_DEV_BSG

Don't define an empty struct bsg_class_device if !CONFIG_BLK_DEV_BSG.

It's embedded in struct request_queue, but there we have

#if defined(CONFIG_BLK_DEV_BSG)
	struct bsg_class_device bsg_dev;
#endif

anyway.

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bsg.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index bd998ca6cb2e..8547b10c388b 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -60,7 +60,6 @@ struct bsg_class_device {
 extern int bsg_register_queue(struct request_queue *, const char *);
 extern void bsg_unregister_queue(struct request_queue *);
 #else
-struct bsg_class_device { };
 #define bsg_register_queue(disk, name)		(0)
 #define bsg_unregister_queue(disk)	do { } while (0)
 #endif
-- 
cgit v1.3-8-gc7d7


From 769848c03895b63e5662eb7e4ec8c4866f7d0183 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 17 Jul 2007 04:03:05 -0700
Subject: Add __GFP_MOVABLE for callers to flag allocations from high memory
 that may be migrated

It is often known at allocation time whether a page may be migrated or not.
This patch adds a flag called __GFP_MOVABLE and a new mask called
GFP_HIGH_MOVABLE.  Allocations using the __GFP_MOVABLE can be either migrated
using the page migration mechanism or reclaimed by syncing with backing
storage and discarding.

An API function very similar to alloc_zeroed_user_highpage() is added for
__GFP_MOVABLE allocations called alloc_zeroed_user_highpage_movable().  The
flags used by alloc_zeroed_user_highpage() are not changed because it would
change the semantics of an existing API.  After this patch is applied there
are no in-kernel users of alloc_zeroed_user_highpage() so it probably should
be marked deprecated if this patch is merged.

Note that this patch includes a minor cleanup to the use of __GFP_ZERO in
shmem.c to keep all flag modifications to inode->mapping in the
shmem_dir_alloc() helper function.  This clean-up suggestion is courtesy of
Hugh Dickens.

Additional credit goes to Christoph Lameter and Linus Torvalds for shaping the
concept.  Credit to Hugh Dickens for catching issues with shmem swap vector
and ramfs allocations.

[akpm@linux-foundation.org: build fix]
[hugh@veritas.com: __GFP_ZERO cleanup]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                  |  2 +-
 fs/inode.c                   | 10 +++++++--
 fs/ramfs/inode.c             |  1 +
 include/asm-alpha/page.h     |  3 ++-
 include/asm-cris/page.h      |  3 ++-
 include/asm-h8300/page.h     |  3 ++-
 include/asm-i386/page.h      |  3 ++-
 include/asm-ia64/page.h      | 13 +++++------
 include/asm-m32r/page.h      |  3 ++-
 include/asm-m68knommu/page.h |  3 ++-
 include/asm-s390/page.h      |  3 ++-
 include/asm-x86_64/page.h    |  3 ++-
 include/linux/gfp.h          | 16 +++++++++++++-
 include/linux/highmem.h      | 51 ++++++++++++++++++++++++++++++++++++++++++--
 mm/memory.c                  |  9 ++++----
 mm/mempolicy.c               |  5 +++--
 mm/migrate.c                 |  3 ++-
 mm/shmem.c                   |  7 ++++--
 mm/swap_state.c              |  3 ++-
 19 files changed, 114 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 424165b569f8..94344b2e0b46 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -982,7 +982,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct buffer_head *bh;
 
 	page = find_or_create_page(inode->i_mapping, index,
-		mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
 	if (!page)
 		return NULL;
 
diff --git a/fs/inode.c b/fs/inode.c
index 9a012cc5b6cd..47b87b071de3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -145,7 +145,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 		mapping->a_ops = &empty_aops;
  		mapping->host = inode;
 		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+		mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
 
@@ -519,7 +519,13 @@ repeat:
  *	new_inode 	- obtain an inode
  *	@sb: superblock
  *
- *	Allocates a new inode for given superblock.
+ *	Allocates a new inode for given superblock. The default gfp_mask
+ *	for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
+ *	If HIGHMEM pages are unsuitable or it is known that pages allocated
+ *	for the page cache are not reclaimable or migratable,
+ *	mapping_set_gfp_mask() must be called with suitable flags on the
+ *	newly created inode's mapping
+ *
  */
 struct inode *new_inode(struct super_block *sb)
 {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d40d22b347b7..ef2b46d099ff 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -60,6 +60,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
+		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
 		default:
diff --git a/include/asm-alpha/page.h b/include/asm-alpha/page.h
index d2bed3cb33ff..bae7f05716d4 100644
--- a/include/asm-alpha/page.h
+++ b/include/asm-alpha/page.h
@@ -17,7 +17,8 @@
 extern void clear_page(void *page);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vmaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vmaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 extern void copy_page(void * _to, void * _from);
diff --git a/include/asm-cris/page.h b/include/asm-cris/page.h
index 9f13c32552bf..0648e3153f81 100644
--- a/include/asm-cris/page.h
+++ b/include/asm-cris/page.h
@@ -20,7 +20,8 @@
 #define clear_user_page(page, vaddr, pg)    clear_page(page)
 #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
diff --git a/include/asm-h8300/page.h b/include/asm-h8300/page.h
index 3b4f2903f91d..c8cc81a3aca5 100644
--- a/include/asm-h8300/page.h
+++ b/include/asm-h8300/page.h
@@ -22,7 +22,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 818ac8bf01e2..99cf5d3692a9 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -34,7 +34,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 485759ba9e36..d6345464a2b3 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -87,12 +87,13 @@ do {						\
 } while (0)
 
 
-#define alloc_zeroed_user_highpage(vma, vaddr) \
-({						\
-	struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
-	if (page)				\
- 		flush_dcache_page(page);	\
-	page;					\
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr)		\
+({									\
+	struct page *page = alloc_page_vma(				\
+		GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr);	\
+	if (page)							\
+ 		flush_dcache_page(page);				\
+	page;								\
 })
 
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
diff --git a/include/asm-m32r/page.h b/include/asm-m32r/page.h
index 6f6ecf7d14a3..04fd183a2c58 100644
--- a/include/asm-m32r/page.h
+++ b/include/asm-m32r/page.h
@@ -15,7 +15,8 @@ extern void copy_page(void *to, void *from);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
diff --git a/include/asm-m68knommu/page.h b/include/asm-m68knommu/page.h
index 2a1b8bdcb29c..9efa0a9851b1 100644
--- a/include/asm-m68knommu/page.h
+++ b/include/asm-m68knommu/page.h
@@ -22,7 +22,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
diff --git a/include/asm-s390/page.h b/include/asm-s390/page.h
index 05ea6f172786..f326451ed6ec 100644
--- a/include/asm-s390/page.h
+++ b/include/asm-s390/page.h
@@ -64,7 +64,8 @@ static inline void copy_page(void *to, void *from)
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index e327c830da0c..88adf1afb0a2 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -48,7 +48,8 @@ void copy_page(void *, void *);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+	alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 /*
  * These are used to make use of C type-checking..
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0d2ef0b082a6..e5882fe49f83 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,6 +30,9 @@ struct vm_area_struct;
  * cannot handle allocation failures.
  *
  * __GFP_NORETRY: The VM implementation must not retry indefinitely.
+ *
+ * __GFP_MOVABLE: Flag that this page will be movable by the page migration
+ * mechanism or reclaimed
  */
 #define __GFP_WAIT	((__force gfp_t)0x10u)	/* Can wait and reschedule? */
 #define __GFP_HIGH	((__force gfp_t)0x20u)	/* Should access emergency pools? */
@@ -45,6 +48,7 @@ struct vm_area_struct;
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
+#define __GFP_MOVABLE	((__force gfp_t)0x80000u) /* Page is movable */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -53,7 +57,8 @@ struct vm_area_struct;
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_COMP| \
-			__GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_THISNODE)
+			__GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_THISNODE| \
+			__GFP_MOVABLE)
 
 /* This equals 0, but use constants in case they ever change */
 #define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
@@ -65,6 +70,15 @@ struct vm_area_struct;
 #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
 #define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
 			 __GFP_HIGHMEM)
+#define GFP_HIGHUSER_MOVABLE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
+				 __GFP_HARDWALL | __GFP_HIGHMEM | \
+				 __GFP_MOVABLE)
+#define GFP_NOFS_PAGECACHE	(__GFP_WAIT | __GFP_IO | __GFP_MOVABLE)
+#define GFP_USER_PAGECACHE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
+				 __GFP_HARDWALL | __GFP_MOVABLE)
+#define GFP_HIGHUSER_PAGECACHE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
+				 __GFP_HARDWALL | __GFP_HIGHMEM | \
+				 __GFP_MOVABLE)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 98e2cce996a4..12c5e4e3135a 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -73,10 +73,27 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
 }
 
 #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+/**
+ * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags
+ * @movableflags: The GFP flags related to the pages future ability to move like __GFP_MOVABLE
+ * @vma: The VMA the page is to be allocated for
+ * @vaddr: The virtual address the page will be inserted into
+ *
+ * This function will allocate a page for a VMA but the caller is expected
+ * to specify via movableflags whether the page will be movable in the
+ * future or not
+ *
+ * An architecture may override this function by defining
+ * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and providing their own
+ * implementation.
+ */
 static inline struct page *
-alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
+__alloc_zeroed_user_highpage(gfp_t movableflags,
+			struct vm_area_struct *vma,
+			unsigned long vaddr)
 {
-	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+	struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags,
+			vma, vaddr);
 
 	if (page)
 		clear_user_highpage(page, vaddr);
@@ -85,6 +102,36 @@ alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
 }
 #endif
 
+/**
+ * alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA
+ * @vma: The VMA the page is to be allocated for
+ * @vaddr: The virtual address the page will be inserted into
+ *
+ * This function will allocate a page for a VMA that the caller knows will
+ * not be able to move in the future using move_pages() or reclaim. If it
+ * is known that the page can move, use alloc_zeroed_user_highpage_movable
+ */
+static inline struct page *
+alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
+{
+	return __alloc_zeroed_user_highpage(0, vma, vaddr);
+}
+
+/**
+ * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
+ * @vma: The VMA the page is to be allocated for
+ * @vaddr: The virtual address the page will be inserted into
+ *
+ * This function will allocate a page for a VMA that the caller knows will
+ * be able to migrate in the future using move_pages() or reclaimed
+ */
+static inline struct page *
+alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+					unsigned long vaddr)
+{
+	return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
+}
+
 static inline void clear_highpage(struct page *page)
 {
 	void *kaddr = kmap_atomic(page, KM_USER0);
diff --git a/mm/memory.c b/mm/memory.c
index b3d73bb1f680..9c6ff7fffdc8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1715,11 +1715,11 @@ gotten:
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
 	if (old_page == ZERO_PAGE(address)) {
-		new_page = alloc_zeroed_user_highpage(vma, address);
+		new_page = alloc_zeroed_user_highpage_movable(vma, address);
 		if (!new_page)
 			goto oom;
 	} else {
-		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!new_page)
 			goto oom;
 		cow_user_page(new_page, old_page, address, vma);
@@ -2237,7 +2237,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		if (unlikely(anon_vma_prepare(vma)))
 			goto oom;
-		page = alloc_zeroed_user_highpage(vma, address);
+		page = alloc_zeroed_user_highpage_movable(vma, address);
 		if (!page)
 			goto oom;
 
@@ -2340,7 +2340,8 @@ retry:
 
 			if (unlikely(anon_vma_prepare(vma)))
 				goto oom;
-			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+						vma, address);
 			if (!page)
 				goto oom;
 			copy_user_highpage(page, new_page, address, vma);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 188f8d9c4aed..4c0f99996811 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -594,7 +594,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 
 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 {
-	return alloc_pages_node(node, GFP_HIGHUSER, 0);
+	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -710,7 +710,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 {
 	struct vm_area_struct *vma = (struct vm_area_struct *)private;
 
-	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+					page_address_in_vma(page, vma));
 }
 #else
 
diff --git a/mm/migrate.c b/mm/migrate.c
index a91ca00abebe..34d8ada053e4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -761,7 +761,8 @@ static struct page *new_page_node(struct page *p, unsigned long private,
 
 	*result = &pm->status;
 
-	return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
+	return alloc_pages_node(pm->node,
+				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
 }
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 0493e4d0bcaa..e49181d9d893 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -93,8 +93,11 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
 	 * The above definition of ENTRIES_PER_PAGE, and the use of
 	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
 	 * might be reconsidered if it ever diverges from PAGE_SIZE.
+	 *
+	 * __GFP_MOVABLE is masked out as swap vectors cannot move
 	 */
-	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+	return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO,
+				PAGE_CACHE_SHIFT-PAGE_SHIFT);
 }
 
 static inline void shmem_dir_free(struct page *page)
@@ -372,7 +375,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 		}
 
 		spin_unlock(&info->lock);
-		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
+		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
 		if (page)
 			set_page_private(page, 0);
 		spin_lock(&info->lock);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 925d5c50f18d..67daecb6031a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -334,7 +334,8 @@ struct page *read_swap_cache_async(swp_entry_t entry,
 		 * Get a new page to read into from swap.
 		 */
 		if (!new_page) {
-			new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+			new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+								vma, addr);
 			if (!new_page)
 				break;		/* Out of memory */
 		}
-- 
cgit v1.3-8-gc7d7


From 2a1e274acf0b1c192face19a4be7c12d4503eaaf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 17 Jul 2007 04:03:12 -0700
Subject: Create the ZONE_MOVABLE zone

The following 8 patches against 2.6.20-mm2 create a zone called ZONE_MOVABLE
that is only usable by allocations that specify both __GFP_HIGHMEM and
__GFP_MOVABLE.  This has the effect of keeping all non-movable pages within a
single memory partition while allowing movable allocations to be satisfied
from either partition.  The patches may be applied with the list-based
anti-fragmentation patches that groups pages together based on mobility.

The size of the zone is determined by a kernelcore= parameter specified at
boot-time.  This specifies how much memory is usable by non-movable
allocations and the remainder is used for ZONE_MOVABLE.  Any range of pages
within ZONE_MOVABLE can be released by migrating the pages or by reclaiming.

When selecting a zone to take pages from for ZONE_MOVABLE, there are two
things to consider.  First, only memory from the highest populated zone is
used for ZONE_MOVABLE.  On the x86, this is probably going to be ZONE_HIGHMEM
but it would be ZONE_DMA on ppc64 or possibly ZONE_DMA32 on x86_64.  Second,
the amount of memory usable by the kernel will be spread evenly throughout
NUMA nodes where possible.  If the nodes are not of equal size, the amount of
memory usable by the kernel on some nodes may be greater than others.

By default, the zone is not as useful for hugetlb allocations because they are
pinned and non-migratable (currently at least).  A sysctl is provided that
allows huge pages to be allocated from that zone.  This means that the huge
page pool can be resized to the size of ZONE_MOVABLE during the lifetime of
the system assuming that pages are not mlocked.  Despite huge pages being
non-movable, we do not introduce additional external fragmentation of note as
huge pages are always the largest contiguous block we care about.

Credit goes to Andy Whitcroft for catching a large variety of problems during
review of the patches.

This patch creates an additional zone, ZONE_MOVABLE.  This zone is only usable
by allocations which specify both __GFP_HIGHMEM and __GFP_MOVABLE.  Hot-added
memory continues to be placed in their existing destination as there is no
mechanism to redirect them to a specific zone.

[y-goto@jp.fujitsu.com: Fix section mismatch of memory hotplug related code]
[akpm@linux-foundation.org: various fixes]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h    |   3 +
 include/linux/mm.h     |   1 +
 include/linux/mmzone.h |  20 ++++-
 include/linux/vmstat.h |   5 +-
 mm/highmem.c           |   7 +-
 mm/page_alloc.c        | 232 ++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/vmstat.c            |   2 +-
 7 files changed, 260 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index e5882fe49f83..bc68dd9a6d41 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -106,6 +106,9 @@ static inline enum zone_type gfp_zone(gfp_t flags)
 	if (flags & __GFP_DMA32)
 		return ZONE_DMA32;
 #endif
+	if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
+			(__GFP_HIGHMEM | __GFP_MOVABLE))
+		return ZONE_MOVABLE;
 #ifdef CONFIG_HIGHMEM
 	if (flags & __GFP_HIGHMEM)
 		return ZONE_HIGHMEM;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 97d0cddfd223..857e44817178 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1005,6 +1005,7 @@ extern unsigned long find_max_pfn_with_active_regions(void);
 extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
 extern void sparse_memory_present_with_active_regions(int nid);
+extern int cmdline_parse_kernelcore(char *p);
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 extern int early_pfn_to_nid(unsigned long pfn);
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 04b1636a970b..d71ff763c9df 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -146,6 +146,7 @@ enum zone_type {
 	 */
 	ZONE_HIGHMEM,
 #endif
+	ZONE_MOVABLE,
 	MAX_NR_ZONES
 };
 
@@ -167,6 +168,7 @@ enum zone_type {
 	+ defined(CONFIG_ZONE_DMA32)	\
 	+ 1				\
 	+ defined(CONFIG_HIGHMEM)	\
+	+ 1				\
 )
 #if __ZONE_COUNT < 2
 #define ZONES_SHIFT 0
@@ -499,10 +501,22 @@ static inline int populated_zone(struct zone *zone)
 	return (!!zone->present_pages);
 }
 
+extern int movable_zone;
+
+static inline int zone_movable_is_highmem(void)
+{
+#if defined(CONFIG_HIGHMEM) && defined(CONFIG_ARCH_POPULATES_NODE_MAP)
+	return movable_zone == ZONE_HIGHMEM;
+#else
+	return 0;
+#endif
+}
+
 static inline int is_highmem_idx(enum zone_type idx)
 {
 #ifdef CONFIG_HIGHMEM
-	return (idx == ZONE_HIGHMEM);
+	return (idx == ZONE_HIGHMEM ||
+		(idx == ZONE_MOVABLE && zone_movable_is_highmem()));
 #else
 	return 0;
 #endif
@@ -522,7 +536,9 @@ static inline int is_normal_idx(enum zone_type idx)
 static inline int is_highmem(struct zone *zone)
 {
 #ifdef CONFIG_HIGHMEM
-	return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
+	int zone_idx = zone - zone->zone_pgdat->node_zones;
+	return zone_idx == ZONE_HIGHMEM ||
+		(zone_idx == ZONE_MOVABLE && zone_movable_is_highmem());
 #else
 	return 0;
 #endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index d9325cf8a134..75370ec0923e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -25,7 +25,7 @@
 #define HIGHMEM_ZONE(xx)
 #endif
 
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx)
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		FOR_ALL_ZONES(PGALLOC),
@@ -170,7 +170,8 @@ static inline unsigned long node_page_state(int node,
 #ifdef CONFIG_HIGHMEM
 		zone_page_state(&zones[ZONE_HIGHMEM], item) +
 #endif
-		zone_page_state(&zones[ZONE_NORMAL], item);
+		zone_page_state(&zones[ZONE_NORMAL], item) +
+		zone_page_state(&zones[ZONE_MOVABLE], item);
 }
 
 extern void zone_statistics(struct zonelist *, struct zone *);
diff --git a/mm/highmem.c b/mm/highmem.c
index be8f8d36a8b9..7a967bc35152 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,9 +46,14 @@ unsigned int nr_free_highpages (void)
 	pg_data_t *pgdat;
 	unsigned int pages = 0;
 
-	for_each_online_pgdat(pgdat)
+	for_each_online_pgdat(pgdat) {
 		pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
 			NR_FREE_PAGES);
+		if (zone_movable_is_highmem())
+			pages += zone_page_state(
+					&pgdat->node_zones[ZONE_MOVABLE],
+					NR_FREE_PAGES);
+	}
 
 	return pages;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f9e4e647d7e8..c3f6f851f76e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -80,8 +80,9 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 	 256,
 #endif
 #ifdef CONFIG_HIGHMEM
-	 32
+	 32,
 #endif
+	 32,
 };
 
 EXPORT_SYMBOL(totalram_pages);
@@ -95,8 +96,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 	 "Normal",
 #ifdef CONFIG_HIGHMEM
-	 "HighMem"
+	 "HighMem",
 #endif
+	 "Movable",
 };
 
 int min_free_kbytes = 1024;
@@ -134,6 +136,12 @@ static unsigned long __meminitdata dma_reserve;
   static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
   static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+  unsigned long __initdata required_kernelcore;
+  unsigned long __initdata zone_movable_pfn[MAX_NUMNODES];
+
+  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+  int movable_zone;
+  EXPORT_SYMBOL(movable_zone);
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 #if MAX_NUMNODES > 1
@@ -1480,7 +1488,7 @@ unsigned int nr_free_buffer_pages(void)
  */
 unsigned int nr_free_pagecache_pages(void)
 {
-	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
+	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
 }
 
 static inline void show_node(struct zone *zone)
@@ -2666,6 +2674,63 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
 	account_node_boundary(nid, start_pfn, end_pfn);
 }
 
+/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered in monotonic
+ * increasing memory addresses so that the "highest" populated zone is used
+ */
+void __init find_usable_zone_for_movable(void)
+{
+	int zone_index;
+	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+		if (zone_index == ZONE_MOVABLE)
+			continue;
+
+		if (arch_zone_highest_possible_pfn[zone_index] >
+				arch_zone_lowest_possible_pfn[zone_index])
+			break;
+	}
+
+	VM_BUG_ON(zone_index == -1);
+	movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independant of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonic increases memory addresses
+ */
+void __meminit adjust_zone_range_for_zone_movable(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn)
+{
+	/* Only adjust if ZONE_MOVABLE is on this node */
+	if (zone_movable_pfn[nid]) {
+		/* Size ZONE_MOVABLE */
+		if (zone_type == ZONE_MOVABLE) {
+			*zone_start_pfn = zone_movable_pfn[nid];
+			*zone_end_pfn = min(node_end_pfn,
+				arch_zone_highest_possible_pfn[movable_zone]);
+
+		/* Adjust for ZONE_MOVABLE starting within this range */
+		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
+				*zone_end_pfn > zone_movable_pfn[nid]) {
+			*zone_end_pfn = zone_movable_pfn[nid];
+
+		/* Check if this whole range is within ZONE_MOVABLE */
+		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
+			*zone_start_pfn = *zone_end_pfn;
+	}
+}
+
 /*
  * Return the number of pages a zone spans in a node, including holes
  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
@@ -2681,6 +2746,9 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
 	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
 	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+	adjust_zone_range_for_zone_movable(nid, zone_type,
+				node_start_pfn, node_end_pfn,
+				&zone_start_pfn, &zone_end_pfn);
 
 	/* Check that this node has pages within the zone's required range */
 	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
@@ -2771,6 +2839,9 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
 							node_end_pfn);
 
+	adjust_zone_range_for_zone_movable(nid, zone_type,
+			node_start_pfn, node_end_pfn,
+			&zone_start_pfn, &zone_end_pfn);
 	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
@@ -3148,6 +3219,122 @@ unsigned long __init find_max_pfn_with_active_regions(void)
 	return max_pfn;
 }
 
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ */
+void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+{
+	int i, nid;
+	unsigned long usable_startpfn;
+	unsigned long kernelcore_node, kernelcore_remaining;
+	int usable_nodes = num_online_nodes();
+
+	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
+	if (!required_kernelcore)
+		return;
+
+	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+	find_usable_zone_for_movable();
+	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+	/* Spread kernelcore memory as evenly as possible throughout nodes */
+	kernelcore_node = required_kernelcore / usable_nodes;
+	for_each_online_node(nid) {
+		/*
+		 * Recalculate kernelcore_node if the division per node
+		 * now exceeds what is necessary to satisfy the requested
+		 * amount of memory for the kernel
+		 */
+		if (required_kernelcore < kernelcore_node)
+			kernelcore_node = required_kernelcore / usable_nodes;
+
+		/*
+		 * As the map is walked, we track how much memory is usable
+		 * by the kernel using kernelcore_remaining. When it is
+		 * 0, the rest of the node is usable by ZONE_MOVABLE
+		 */
+		kernelcore_remaining = kernelcore_node;
+
+		/* Go through each range of PFNs within this node */
+		for_each_active_range_index_in_nid(i, nid) {
+			unsigned long start_pfn, end_pfn;
+			unsigned long size_pages;
+
+			start_pfn = max(early_node_map[i].start_pfn,
+						zone_movable_pfn[nid]);
+			end_pfn = early_node_map[i].end_pfn;
+			if (start_pfn >= end_pfn)
+				continue;
+
+			/* Account for what is only usable for kernelcore */
+			if (start_pfn < usable_startpfn) {
+				unsigned long kernel_pages;
+				kernel_pages = min(end_pfn, usable_startpfn)
+								- start_pfn;
+
+				kernelcore_remaining -= min(kernel_pages,
+							kernelcore_remaining);
+				required_kernelcore -= min(kernel_pages,
+							required_kernelcore);
+
+				/* Continue if range is now fully accounted */
+				if (end_pfn <= usable_startpfn) {
+
+					/*
+					 * Push zone_movable_pfn to the end so
+					 * that if we have to rebalance
+					 * kernelcore across nodes, we will
+					 * not double account here
+					 */
+					zone_movable_pfn[nid] = end_pfn;
+					continue;
+				}
+				start_pfn = usable_startpfn;
+			}
+
+			/*
+			 * The usable PFN range for ZONE_MOVABLE is from
+			 * start_pfn->end_pfn. Calculate size_pages as the
+			 * number of pages used as kernelcore
+			 */
+			size_pages = end_pfn - start_pfn;
+			if (size_pages > kernelcore_remaining)
+				size_pages = kernelcore_remaining;
+			zone_movable_pfn[nid] = start_pfn + size_pages;
+
+			/*
+			 * Some kernelcore has been met, update counts and
+			 * break if the kernelcore for this node has been
+			 * satisified
+			 */
+			required_kernelcore -= min(required_kernelcore,
+								size_pages);
+			kernelcore_remaining -= size_pages;
+			if (!kernelcore_remaining)
+				break;
+		}
+	}
+
+	/*
+	 * If there is still required_kernelcore, we do another pass with one
+	 * less node in the count. This will push zone_movable_pfn[nid] further
+	 * along on the nodes that still have memory until kernelcore is
+	 * satisified
+	 */
+	usable_nodes--;
+	if (usable_nodes && required_kernelcore > usable_nodes)
+		goto restart;
+
+	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		zone_movable_pfn[nid] =
+			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+}
+
 /**
  * free_area_init_nodes - Initialise all pg_data_t and zone data
  * @max_zone_pfn: an array of max PFNs for each zone
@@ -3177,19 +3364,37 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
 	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
 	for (i = 1; i < MAX_NR_ZONES; i++) {
+		if (i == ZONE_MOVABLE)
+			continue;
 		arch_zone_lowest_possible_pfn[i] =
 			arch_zone_highest_possible_pfn[i-1];
 		arch_zone_highest_possible_pfn[i] =
 			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
 	}
+	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
+	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
+
+	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
+	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
 
 	/* Print out the zone ranges */
 	printk("Zone PFN ranges:\n");
-	for (i = 0; i < MAX_NR_ZONES; i++)
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (i == ZONE_MOVABLE)
+			continue;
 		printk("  %-8s %8lu -> %8lu\n",
 				zone_names[i],
 				arch_zone_lowest_possible_pfn[i],
 				arch_zone_highest_possible_pfn[i]);
+	}
+
+	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
+	printk("Movable zone start PFN for each node\n");
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (zone_movable_pfn[i])
+			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
+	}
 
 	/* Print out the early_node_map[] */
 	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
@@ -3206,6 +3411,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 				find_min_pfn_for_node(nid), NULL);
 	}
 }
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+int __init cmdline_parse_kernelcore(char *p)
+{
+	unsigned long long coremem;
+	if (!p)
+		return -EINVAL;
+
+	coremem = memparse(p, &p);
+	required_kernelcore = coremem >> PAGE_SHIFT;
+
+	/* Paranoid check that UL is enough for required_kernelcore */
+	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+	return 0;
+}
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 /**
diff --git a/mm/vmstat.c b/mm/vmstat.c
index eceaf496210f..fadf791cd7e6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -472,7 +472,7 @@ const struct seq_operations fragmentation_op = {
 #endif
 
 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
-					TEXT_FOR_HIGHMEM(xx)
+					TEXT_FOR_HIGHMEM(xx) xx "_movable",
 
 static const char * const vmstat_text[] = {
 	/* Zoned VM counters */
-- 
cgit v1.3-8-gc7d7


From 396faf0303d273219db5d7eb4a2879ad977ed185 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 17 Jul 2007 04:03:13 -0700
Subject: Allow huge page allocations to use GFP_HIGH_MOVABLE

Huge pages are not movable so are not allocated from ZONE_MOVABLE.  However,
as ZONE_MOVABLE will always have pages that can be migrated or reclaimed, it
can be used to satisfy hugepage allocations even when the system has been
running a long time.  This allows an administrator to resize the hugepage pool
at runtime depending on the size of ZONE_MOVABLE.

This patch adds a new sysctl called hugepages_treat_as_movable.  When a
non-zero value is written to it, future allocations for the huge page pool
will use ZONE_MOVABLE.  Despite huge pages being non-movable, we do not
introduce additional external fragmentation of note as huge pages are always
the largest contiguous block we care about.

[akpm@linux-foundation.org: various fixes]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h   |  2 ++
 include/linux/mempolicy.h |  6 +++---
 kernel/sysctl.c           |  8 ++++++++
 mm/hugetlb.c              | 23 ++++++++++++++++++++---
 mm/mempolicy.c            |  5 +++--
 5 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2c13715e9dde..49b7053043ad 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -15,6 +15,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 }
 
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
+int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
@@ -29,6 +30,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
+extern unsigned long hugepages_treat_as_movable;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
 extern int sysctl_hugetlb_shm_group;
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index daabb3aa1ec6..e147cf50529f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -159,7 +159,7 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr);
+		unsigned long addr, gfp_t gfp_flags);
 extern unsigned slab_node(struct mempolicy *policy);
 
 extern enum zone_type policy_zone;
@@ -256,9 +256,9 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, gfp_t gfp_flags)
 {
-	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+	return NODE_DATA(0)->node_zonelists + gfp_zone(gfp_flags);
 }
 
 static inline int do_migrate_pages(struct mm_struct *mm,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2ce7acf841ae..48dae075d5c2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -826,6 +826,14 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	 },
+	 {
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hugepages_treat_as_movable",
+		.data		= &hugepages_treat_as_movable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &hugetlb_treat_movable_handler,
+	},
 #endif
 	{
 		.ctl_name	= VM_LOWMEM_RESERVE_RATIO,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index acc0fb3cf067..58980676b842 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,9 @@ unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
+unsigned long hugepages_treat_as_movable;
+
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
@@ -68,12 +71,13 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 {
 	int nid;
 	struct page *page = NULL;
-	struct zonelist *zonelist = huge_zonelist(vma, address);
+	struct zonelist *zonelist = huge_zonelist(vma, address,
+						htlb_alloc_mask);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
 		nid = zone_to_nid(*z);
-		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
+		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
 	}
@@ -113,7 +117,7 @@ static int alloc_fresh_huge_page(void)
 	prev_nid = nid;
 	spin_unlock(&nid_lock);
 
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
@@ -263,6 +267,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 	max_huge_pages = set_max_huge_pages(max_huge_pages);
 	return 0;
 }
+
+int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
+			struct file *file, void __user *buffer,
+			size_t *length, loff_t *ppos)
+{
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (hugepages_treat_as_movable)
+		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
+	else
+		htlb_alloc_mask = GFP_HIGHUSER;
+	return 0;
+}
+
 #endif /* CONFIG_SYSCTL */
 
 int hugetlb_report_meminfo(char *buf)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4c0f99996811..9f4e9b95e8f2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1203,7 +1203,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 
 #ifdef CONFIG_HUGETLBFS
 /* Return a zonelist suitable for a huge page allocation. */
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+							gfp_t gfp_flags)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
@@ -1211,7 +1212,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
-		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
 	}
 	return zonelist_policy(GFP_HIGHUSER, pol);
 }
-- 
cgit v1.3-8-gc7d7


From ed7ed365172e27b0efe9d43cc962723c7193e34e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 17 Jul 2007 04:03:14 -0700
Subject: handle kernelcore=: generic

This patch adds the kernelcore= parameter for x86.

Once all patches are applied, a new command-line parameter exist and a new
sysctl.  This patch adds the necessary documentation.

From: Yasunori Goto <y-goto@jp.fujitsu.com>

  When "kernelcore" boot option is specified, kernel can't boot up on ia64
  because of an infinite loop.  In addition, the parsing code can be handled
  in an architecture-independent manner.

  This patch uses common code to handle the kernelcore= parameter.  It is
  only available to architectures that support arch-independent zone-sizing
  (i.e.  define CONFIG_ARCH_POPULATES_NODE_MAP).  Other architectures will
  ignore the boot parameter.

[bunk@stusta.de: make cmdline_parse_kernelcore() static]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt  | 15 +++++++++++++++
 Documentation/kernel-parameters.txt | 16 ++++++++++++++++
 Documentation/sysctl/vm.txt         |  3 ++-
 arch/ia64/kernel/efi.c              |  1 +
 include/linux/mm.h                  |  1 -
 mm/page_alloc.c                     |  5 ++++-
 6 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 460b892d089e..ebffdffb3d99 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1348,6 +1348,21 @@ nr_hugepages configures number of hugetlb page reserved for the system.
 hugetlb_shm_group contains group id that is allowed to create SysV shared
 memory segment using hugetlb page.
 
+hugepages_treat_as_movable
+--------------------------
+
+This parameter is only useful when kernelcore= is specified at boot time to
+create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages
+are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero
+value written to hugepages_treat_as_movable allows huge pages to be allocated
+from ZONE_MOVABLE.
+
+Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge
+pages pool can easily grow or shrink within. Assuming that applications are
+not running that mlock() a lot of memory, it is likely the huge pages pool
+can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value
+into nr_hugepages and triggering page reclaim.
+
 laptop_mode
 -----------
 
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8363ad3ba018..1794affbd06f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -817,6 +817,22 @@ and is between 256 and 4096 characters. It is defined in the file
 	js=		[HW,JOY] Analog joystick
 			See Documentation/input/joystick.txt.
 
+	kernelcore=nn[KMG]	[KNL,IA-32,IA-64,PPC,X86-64] This parameter
+			specifies the amount of memory usable by the kernel
+			for non-movable allocations.  The requested amount is
+			spread evenly throughout all nodes in the system. The
+			remaining memory in each node is used for Movable
+			pages. In the event, a node is too small to have both
+			kernelcore and Movable pages, kernelcore pages will
+			take priority and other nodes will have a larger number
+			of kernelcore pages.  The Movable zone is used for the
+			allocation of pages that may be reclaimed or moved
+			by the page migration subsystem.  This means that
+			HugeTLB pages may not be allocated from this zone.
+			Note that allocations like PTEs-from-HighMem still
+			use the HighMem zone if it exists, and the Normal
+			zone if it does not.
+
 	keepinitrd	[HW,ARM]
 
 	kstack=N	[IA-32,X86-64] Print N words from the kernel stack
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index df3ff2095f9d..a0ccc5b60260 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -38,7 +38,8 @@ Currently, these files are in /proc/sys/vm:
 
 dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
 dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
-block_dump, swap_token_timeout, drop-caches:
+block_dump, swap_token_timeout, drop-caches,
+hugepages_treat_as_movable:
 
 See Documentation/filesystems/proc.txt
 
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 75ec3478d8a2..73ca86d03810 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -28,6 +28,7 @@
 #include <linux/time.h>
 #include <linux/efi.h>
 #include <linux/kexec.h>
+#include <linux/mm.h>
 
 #include <asm/io.h>
 #include <asm/kregs.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 857e44817178..97d0cddfd223 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1005,7 +1005,6 @@ extern unsigned long find_max_pfn_with_active_regions(void);
 extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
 extern void sparse_memory_present_with_active_regions(int nid);
-extern int cmdline_parse_kernelcore(char *p);
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 extern int early_pfn_to_nid(unsigned long pfn);
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3f6f851f76e..0a53728a12f5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3416,7 +3416,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  * kernelcore=size sets the amount of memory for use for allocations that
  * cannot be reclaimed or migrated.
  */
-int __init cmdline_parse_kernelcore(char *p)
+static int __init cmdline_parse_kernelcore(char *p)
 {
 	unsigned long long coremem;
 	if (!p)
@@ -3430,6 +3430,9 @@ int __init cmdline_parse_kernelcore(char *p)
 
 	return 0;
 }
+
+early_param("kernelcore", cmdline_parse_kernelcore);
+
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 /**
-- 
cgit v1.3-8-gc7d7


From 5ad333eb66ff1e52a87639822ae088577669dcf9 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Tue, 17 Jul 2007 04:03:16 -0700
Subject: Lumpy Reclaim V4

When we are out of memory of a suitable size we enter reclaim.  The current
reclaim algorithm targets pages in LRU order, which is great for fairness at
order-0 but highly unsuitable if you desire pages at higher orders.  To get
pages of higher order we must shoot down a very high proportion of memory;
>95% in a lot of cases.

This patch set adds a lumpy reclaim algorithm to the allocator.  It targets
groups of pages at the specified order anchored at the end of the active and
inactive lists.  This encourages groups of pages at the requested orders to
move from active to inactive, and active to free lists.  This behaviour is
only triggered out of direct reclaim when higher order pages have been
requested.

This patch set is particularly effective when utilised with an
anti-fragmentation scheme which groups pages of similar reclaimability
together.

This patch set is based on Peter Zijlstra's lumpy reclaim V2 patch which forms
the foundation.  Credit to Mel Gorman for sanitity checking.

Mel said:

  The patches have an application with hugepage pool resizing.

  When lumpy-reclaim is used used with ZONE_MOVABLE, the hugepages pool can
  be resized with greater reliability.  Testing on a desktop machine with 2GB
  of RAM showed that growing the hugepage pool with ZONE_MOVABLE on it's own
  was very slow as the success rate was quite low.  Without lumpy-reclaim,
  each attempt to grow the pool by 100 pages would yield 1 or 2 hugepages.
  With lumpy-reclaim, getting 40 to 70 hugepages on each attempt was typical.

[akpm@osdl.org: ia64 pfn_to_nid fixes and loop cleanup]
[bunk@stusta.de: static declarations for internal functions]
[a.p.zijlstra@chello.nl: initial lumpy V2 implementation]
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c            |   2 +-
 include/linux/mmzone.h |   8 +++
 include/linux/swap.h   |   3 +-
 mm/page_alloc.c        |   5 +-
 mm/vmscan.c            | 171 ++++++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 163 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 94344b2e0b46..d654a3b6209e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -356,7 +356,7 @@ static void free_more_memory(void)
 	for_each_online_pgdat(pgdat) {
 		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 		if (*zones)
-			try_to_free_pages(zones, GFP_NOFS);
+			try_to_free_pages(zones, 0, GFP_NOFS);
 	}
 }
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d71ff763c9df..da8eb8ad9e9b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,6 +24,14 @@
 #endif
 #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
 
+/*
+ * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
+ * costly to service.  That is between allocation orders which should
+ * coelesce naturally under reasonable reclaim pressure and those which
+ * will not.
+ */
+#define PAGE_ALLOC_COSTLY_ORDER 3
+
 struct free_area {
 	struct list_head	free_list;
 	unsigned long		nr_free;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 006868881346..665f85f2a3af 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -188,7 +188,8 @@ extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern unsigned long try_to_free_pages(struct zone **, gfp_t);
+extern unsigned long try_to_free_pages(struct zone **zones, int order,
+					gfp_t gfp_mask);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac4f8c6b5c10..1a889c3fec59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1333,7 +1333,7 @@ nofail_alloc:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -1370,7 +1370,8 @@ nofail_alloc:
 	 */
 	do_retry = 0;
 	if (!(gfp_mask & __GFP_NORETRY)) {
-		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+		if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
+						(gfp_mask & __GFP_REPEAT))
 			do_retry = 1;
 		if (gfp_mask & __GFP_NOFAIL)
 			do_retry = 1;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1be5a6376ef0..1d9971d8924b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,6 +66,8 @@ struct scan_control {
 	int swappiness;
 
 	int all_unreclaimable;
+
+	int order;
 };
 
 /*
@@ -481,7 +483,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		referenced = page_referenced(page, 1);
 		/* In active use or really unfreeable?  Activate it. */
-		if (referenced && page_mapping_inuse(page))
+		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+					referenced && page_mapping_inuse(page))
 			goto activate_locked;
 
 #ifdef CONFIG_SWAP
@@ -514,7 +517,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
-			if (referenced)
+			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
 				goto keep_locked;
 			if (!may_enter_fs)
 				goto keep_locked;
@@ -598,6 +601,51 @@ keep:
 	return nr_reclaimed;
 }
 
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1	/* Isolate active pages. */
+#define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU.  Only take this page
+ * if it is of the appropriate PageActive status.  Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page:	page to consider
+ * mode:	one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -ve errno on failure.
+ */
+static int __isolate_lru_page(struct page *page, int mode)
+{
+	int ret = -EINVAL;
+
+	/* Only take pages on the LRU. */
+	if (!PageLRU(page))
+		return ret;
+
+	/*
+	 * When checking the active state, we need to be sure we are
+	 * dealing with comparible boolean values.  Take the logical not
+	 * of each.
+	 */
+	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+		return ret;
+
+	ret = -EBUSY;
+	if (likely(get_page_unless_zero(page))) {
+		/*
+		 * Be careful not to clear PageLRU until after we're
+		 * sure the page is not being freed elsewhere -- the
+		 * page release code relies on it.
+		 */
+		ClearPageLRU(page);
+		ret = 0;
+	}
+
+	return ret;
+}
+
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
@@ -612,44 +660,114 @@ keep:
  * @src:	The LRU list to pull pages off.
  * @dst:	The temp list to put pages on to.
  * @scanned:	The number of pages that were scanned.
+ * @order:	The caller's attempted allocation order
+ * @mode:	One of the LRU isolation modes
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned)
+		unsigned long *scanned, int order, int mode)
 {
 	unsigned long nr_taken = 0;
-	struct page *page;
 	unsigned long scan;
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
-		struct list_head *target;
+		struct page *page;
+		unsigned long pfn;
+		unsigned long end_pfn;
+		unsigned long page_pfn;
+		int zone_id;
+
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
 		VM_BUG_ON(!PageLRU(page));
 
-		list_del(&page->lru);
-		target = src;
-		if (likely(get_page_unless_zero(page))) {
-			/*
-			 * Be careful not to clear PageLRU until after we're
-			 * sure the page is not being freed elsewhere -- the
-			 * page release code relies on it.
-			 */
-			ClearPageLRU(page);
-			target = dst;
+		switch (__isolate_lru_page(page, mode)) {
+		case 0:
+			list_move(&page->lru, dst);
 			nr_taken++;
-		} /* else it is being freed elsewhere */
+			break;
 
-		list_add(&page->lru, target);
+		case -EBUSY:
+			/* else it is being freed elsewhere */
+			list_move(&page->lru, src);
+			continue;
+
+		default:
+			BUG();
+		}
+
+		if (!order)
+			continue;
+
+		/*
+		 * Attempt to take all pages in the order aligned region
+		 * surrounding the tag page.  Only take those pages of
+		 * the same active state as that tag page.  We may safely
+		 * round the target page pfn down to the requested order
+		 * as the mem_map is guarenteed valid out to MAX_ORDER,
+		 * where that page is in a different zone we will detect
+		 * it from its zone id and abort this block scan.
+		 */
+		zone_id = page_zone_id(page);
+		page_pfn = page_to_pfn(page);
+		pfn = page_pfn & ~((1 << order) - 1);
+		end_pfn = pfn + (1 << order);
+		for (; pfn < end_pfn; pfn++) {
+			struct page *cursor_page;
+
+			/* The target page is in the block, ignore it. */
+			if (unlikely(pfn == page_pfn))
+				continue;
+
+			/* Avoid holes within the zone. */
+			if (unlikely(!pfn_valid_within(pfn)))
+				break;
+
+			cursor_page = pfn_to_page(pfn);
+			/* Check that we have not crossed a zone boundary. */
+			if (unlikely(page_zone_id(cursor_page) != zone_id))
+				continue;
+			switch (__isolate_lru_page(cursor_page, mode)) {
+			case 0:
+				list_move(&cursor_page->lru, dst);
+				nr_taken++;
+				scan++;
+				break;
+
+			case -EBUSY:
+				/* else it is being freed elsewhere */
+				list_move(&cursor_page->lru, src);
+			default:
+				break;
+			}
+		}
 	}
 
 	*scanned = scan;
 	return nr_taken;
 }
 
+/*
+ * clear_active_flags() is a helper for shrink_active_list(), clearing
+ * any active bits from the pages in the list.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list)
+{
+	int nr_active = 0;
+	struct page *page;
+
+	list_for_each_entry(page, page_list, lru)
+		if (PageActive(page)) {
+			ClearPageActive(page);
+			nr_active++;
+		}
+
+	return nr_active;
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
@@ -671,11 +789,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_taken;
 		unsigned long nr_scan;
 		unsigned long nr_freed;
+		unsigned long nr_active;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-					     &zone->inactive_list,
-					     &page_list, &nr_scan);
-		__mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
+			     &zone->inactive_list,
+			     &page_list, &nr_scan, sc->order,
+			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
+					     ISOLATE_BOTH : ISOLATE_INACTIVE);
+		nr_active = clear_active_flags(&page_list);
+
+		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
+		__mod_zone_page_state(zone, NR_INACTIVE,
+						-(nr_taken - nr_active));
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
@@ -820,7 +945,7 @@ force_reclaim_mapped:
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-				    &l_hold, &pgscanned);
+			    &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
 	zone->pages_scanned += pgscanned;
 	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
@@ -1011,7 +1136,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 {
 	int priority;
 	int ret = 0;
@@ -1026,6 +1151,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
 		.swappiness = vm_swappiness,
+		.order = order,
 	};
 
 	count_vm_event(ALLOCSTALL);
@@ -1131,6 +1257,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
+		.order = order,
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which
-- 
cgit v1.3-8-gc7d7


From 8e1f936b73150f5095448a0fee6d4f30a1f9001d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 17 Jul 2007 04:03:17 -0700
Subject: mm: clean up and kernelify shrinker registration

I can never remember what the function to register to receive VM pressure
is called.  I have to trace down from __alloc_pages() to find it.

It's called "set_shrinker()", and it needs Your Help.

1) Don't hide struct shrinker.  It contains no magic.
2) Don't allocate "struct shrinker".  It's not helpful.
3) Call them "register_shrinker" and "unregister_shrinker".
4) Call the function "shrink" not "shrinker".
5) Reduce the 17 lines of waffly comments to 13, but document it properly.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: David Chinner <dgc@sgi.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c                |  7 ++++++-
 fs/dquot.c                 |  7 ++++++-
 fs/inode.c                 |  7 ++++++-
 fs/mbcache.c               |  9 ++++++---
 fs/nfs/super.c             | 10 ++++++----
 fs/xfs/linux-2.6/xfs_buf.c | 14 ++++++--------
 fs/xfs/quota/xfs_qm.c      | 10 +++++++---
 include/linux/mm.h         | 38 +++++++++++++++++++++-----------------
 mm/vmscan.c                | 42 +++++++++++-------------------------------
 net/sunrpc/auth.c          | 11 ++++++-----
 10 files changed, 81 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 0e73aa0a0e8b..cb9d05056b54 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -883,6 +883,11 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
+static struct shrinker dcache_shrinker = {
+	.shrink = shrink_dcache_memory,
+	.seeks = DEFAULT_SEEKS,
+};
+
 /**
  * d_alloc	-	allocate a dcache entry
  * @parent: parent of entry to allocate
@@ -2115,7 +2120,7 @@ static void __init dcache_init(unsigned long mempages)
 	dentry_cache = KMEM_CACHE(dentry,
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 	
-	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
+	register_shrinker(&dcache_shrinker);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
diff --git a/fs/dquot.c b/fs/dquot.c
index 8819d281500c..7e273151f589 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -538,6 +538,11 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 	return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure;
 }
 
+static struct shrinker dqcache_shrinker = {
+	.shrink = shrink_dqcache_memory,
+	.seeks = DEFAULT_SEEKS,
+};
+
 /*
  * Put reference to dquot
  * NOTE: If you change this function please check whether dqput_blocks() works right...
@@ -1870,7 +1875,7 @@ static int __init dquot_init(void)
 	printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n",
 			nr_hash, order, (PAGE_SIZE << order));
 
-	set_shrinker(DEFAULT_SEEKS, shrink_dqcache_memory);
+	register_shrinker(&dqcache_shrinker);
 
 	return 0;
 }
diff --git a/fs/inode.c b/fs/inode.c
index 47b87b071de3..320e088d0b28 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -462,6 +462,11 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
 	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
+static struct shrinker icache_shrinker = {
+	.shrink = shrink_icache_memory,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
@@ -1385,7 +1390,7 @@ void __init inode_init(unsigned long mempages)
 					 SLAB_MEM_SPREAD),
 					 init_once,
 					 NULL);
-	set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+	register_shrinker(&icache_shrinker);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
diff --git a/fs/mbcache.c b/fs/mbcache.c
index deeb9dc062d9..fbb1d02f8791 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -100,7 +100,6 @@ struct mb_cache {
 static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
-static struct shrinker *mb_shrinker;
 
 static inline int
 mb_cache_indexes(struct mb_cache *cache)
@@ -118,6 +117,10 @@ mb_cache_indexes(struct mb_cache *cache)
 
 static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
 
+static struct shrinker mb_cache_shrinker = {
+	.shrink = mb_cache_shrink_fn,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static inline int
 __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
@@ -662,13 +665,13 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
 
 static int __init init_mbcache(void)
 {
-	mb_shrinker = set_shrinker(DEFAULT_SEEKS, mb_cache_shrink_fn);
+	register_shrinker(&mb_cache_shrinker);
 	return 0;
 }
 
 static void __exit exit_mbcache(void)
 {
-	remove_shrinker(mb_shrinker);
+	unregister_shrinker(&mb_cache_shrinker);
 }
 
 module_init(init_mbcache)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a2b1af89ca1a..adffe1615c51 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -300,7 +300,10 @@ static const struct super_operations nfs4_sops = {
 };
 #endif
 
-static struct shrinker *acl_shrinker;
+static struct shrinker acl_shrinker = {
+	.shrink		= nfs_access_cache_shrinker,
+	.seeks		= DEFAULT_SEEKS,
+};
 
 /*
  * Register the NFS filesystems
@@ -321,7 +324,7 @@ int __init register_nfs_fs(void)
 	if (ret < 0)
 		goto error_2;
 #endif
-	acl_shrinker = set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrinker);
+	register_shrinker(&acl_shrinker);
 	return 0;
 
 #ifdef CONFIG_NFS_V4
@@ -339,8 +342,7 @@ error_0:
  */
 void __exit unregister_nfs_fs(void)
 {
-	if (acl_shrinker != NULL)
-		remove_shrinker(acl_shrinker);
+	unregister_shrinker(&acl_shrinker);
 #ifdef CONFIG_NFS_V4
 	unregister_filesystem(&nfs4_fs_type);
 	nfs_unregister_sysctl();
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2df63622354e..b0f0e58866de 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -35,10 +35,13 @@
 #include <linux/freezer.h>
 
 static kmem_zone_t *xfs_buf_zone;
-static struct shrinker *xfs_buf_shake;
 STATIC int xfsbufd(void *);
 STATIC int xfsbufd_wakeup(int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
+static struct shrinker xfs_buf_shake = {
+	.shrink = xfsbufd_wakeup,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
@@ -1832,14 +1835,9 @@ xfs_buf_init(void)
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
-	xfs_buf_shake = set_shrinker(DEFAULT_SEEKS, xfsbufd_wakeup);
-	if (!xfs_buf_shake)
-		goto out_destroy_xfsdatad_workqueue;
-
+	register_shrinker(&xfs_buf_shake);
 	return 0;
 
- out_destroy_xfsdatad_workqueue:
-	destroy_workqueue(xfsdatad_workqueue);
  out_destroy_xfslogd_workqueue:
 	destroy_workqueue(xfslogd_workqueue);
  out_free_buf_zone:
@@ -1854,7 +1852,7 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-	remove_shrinker(xfs_buf_shake);
+	unregister_shrinker(&xfs_buf_shake);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7def4c699343..2d274b23ade5 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -62,7 +62,6 @@ uint		ndquot;
 
 kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
-static struct shrinker *xfs_qm_shaker;
 
 static cred_t	xfs_zerocr;
 
@@ -78,6 +77,11 @@ STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
 STATIC int	xfs_qm_shake(int, gfp_t);
 
+static struct shrinker xfs_qm_shaker = {
+	.shrink = xfs_qm_shake,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef DEBUG
 extern mutex_t	qcheck_lock;
 #endif
@@ -149,7 +153,7 @@ xfs_Gqm_init(void)
 	} else
 		xqm->qm_dqzone = qm_dqzone;
 
-	xfs_qm_shaker = set_shrinker(DEFAULT_SEEKS, xfs_qm_shake);
+	register_shrinker(&xfs_qm_shaker);
 
 	/*
 	 * The t_dqinfo portion of transactions.
@@ -181,7 +185,7 @@ xfs_qm_destroy(
 
 	ASSERT(xqm != NULL);
 	ASSERT(xqm->qm_nrefs == 0);
-	remove_shrinker(xfs_qm_shaker);
+	unregister_shrinker(&xfs_qm_shaker);
 	hsize = xqm->qm_dqhashmask + 1;
 	for (i = 0; i < hsize; i++) {
 		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 97d0cddfd223..4c482a3ee870 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -810,27 +810,31 @@ extern unsigned long do_mremap(unsigned long addr,
 			       unsigned long flags, unsigned long new_addr);
 
 /*
- * Prototype to add a shrinker callback for ageable caches.
- * 
- * These functions are passed a count `nr_to_scan' and a gfpmask.  They should
- * scan `nr_to_scan' objects, attempting to free them.
+ * A callback you can register to apply pressure to ageable caches.
  *
- * The callback must return the number of objects which remain in the cache.
+ * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
+ * look through the least-recently-used 'nr_to_scan' entries and
+ * attempt to free them up.  It should return the number of objects
+ * which remain in the cache.  If it returns -1, it means it cannot do
+ * any scanning at this time (eg. there is a risk of deadlock).
  *
- * The callback will be passed nr_to_scan == 0 when the VM is querying the
- * cache size, so a fastpath for that case is appropriate.
- */
-typedef int (*shrinker_t)(int nr_to_scan, gfp_t gfp_mask);
-
-/*
- * Add an aging callback.  The int is the number of 'seeks' it takes
- * to recreate one of the objects that these functions age.
+ * The 'gfpmask' refers to the allocation we are currently trying to
+ * fulfil.
+ *
+ * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
+ * querying the cache size, so a fastpath for that case is appropriate.
  */
+struct shrinker {
+	int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
+	int seeks;	/* seeks to recreate an obj */
 
-#define DEFAULT_SEEKS 2
-struct shrinker;
-extern struct shrinker *set_shrinker(int, shrinker_t);
-extern void remove_shrinker(struct shrinker *shrinker);
+	/* These are for internal use */
+	struct list_head list;
+	long nr;	/* objs pending delete */
+};
+#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
+extern void register_shrinker(struct shrinker *);
+extern void unregister_shrinker(struct shrinker *);
 
 /*
  * Some shared mappigns will want the pages marked read-only
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1d9971d8924b..2225b7c9df85 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -70,17 +70,6 @@ struct scan_control {
 	int order;
 };
 
-/*
- * The list of shrinker callbacks used by to apply pressure to
- * ageable caches.
- */
-struct shrinker {
-	shrinker_t		shrinker;
-	struct list_head	list;
-	int			seeks;	/* seeks to recreate an obj */
-	long			nr;	/* objs pending delete */
-};
-
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 #ifdef ARCH_HAS_PREFETCH
@@ -123,34 +112,25 @@ static DECLARE_RWSEM(shrinker_rwsem);
 /*
  * Add a shrinker callback to be called from the vm
  */
-struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+void register_shrinker(struct shrinker *shrinker)
 {
-        struct shrinker *shrinker;
-
-        shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
-        if (shrinker) {
-	        shrinker->shrinker = theshrinker;
-	        shrinker->seeks = seeks;
-	        shrinker->nr = 0;
-	        down_write(&shrinker_rwsem);
-	        list_add_tail(&shrinker->list, &shrinker_list);
-	        up_write(&shrinker_rwsem);
-	}
-	return shrinker;
+	shrinker->nr = 0;
+	down_write(&shrinker_rwsem);
+	list_add_tail(&shrinker->list, &shrinker_list);
+	up_write(&shrinker_rwsem);
 }
-EXPORT_SYMBOL(set_shrinker);
+EXPORT_SYMBOL(register_shrinker);
 
 /*
  * Remove one
  */
-void remove_shrinker(struct shrinker *shrinker)
+void unregister_shrinker(struct shrinker *shrinker)
 {
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
 	up_write(&shrinker_rwsem);
-	kfree(shrinker);
 }
-EXPORT_SYMBOL(remove_shrinker);
+EXPORT_SYMBOL(unregister_shrinker);
 
 #define SHRINK_BATCH 128
 /*
@@ -187,7 +167,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 	list_for_each_entry(shrinker, &shrinker_list, list) {
 		unsigned long long delta;
 		unsigned long total_scan;
-		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
+		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
 
 		delta = (4 * scanned) / shrinker->seeks;
 		delta *= max_pass;
@@ -215,8 +195,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			int shrink_ret;
 			int nr_before;
 
-			nr_before = (*shrinker->shrinker)(0, gfp_mask);
-			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+			nr_before = (*shrinker->shrink)(0, gfp_mask);
+			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
 			if (shrink_ret == -1)
 				break;
 			if (shrink_ret < nr_before)
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index aa55d0a03e6f..29a8ecc60928 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -543,17 +543,18 @@ rpcauth_uptodatecred(struct rpc_task *task)
 		test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0;
 }
 
-
-static struct shrinker *rpc_cred_shrinker;
+static struct shrinker rpc_cred_shrinker = {
+	.shrink = rpcauth_cache_shrinker,
+	.seeks = DEFAULT_SEEKS,
+};
 
 void __init rpcauth_init_module(void)
 {
 	rpc_init_authunix();
-	rpc_cred_shrinker = set_shrinker(DEFAULT_SEEKS, rpcauth_cache_shrinker);
+	register_shrinker(&rpc_cred_shrinker);
 }
 
 void __exit rpcauth_remove_module(void)
 {
-	if (rpc_cred_shrinker != NULL)
-		remove_shrinker(rpc_cred_shrinker);
+	unregister_shrinker(&rpc_cred_shrinker);
 }
-- 
cgit v1.3-8-gc7d7


From 6cb8f91320d3e720351c21741da795fed580b21b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 17 Jul 2007 04:03:22 -0700
Subject: Slab allocators: consistent ZERO_SIZE_PTR support and NULL result
 semantics

Define ZERO_OR_NULL_PTR macro to be able to remove the checks from the
allocators.  Move ZERO_SIZE_PTR related stuff into slab.h.

Make ZERO_SIZE_PTR work for all slab allocators and get rid of the
WARN_ON_ONCE(size == 0) that is still remaining in SLAB.

Make slub return NULL like the other allocators if a too large memory segment
is requested via __kmalloc.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h     | 13 +++++++++++++
 include/linux/slab_def.h | 12 ++++++++++++
 include/linux/slub_def.h | 12 ------------
 mm/slab.c                | 13 ++++++++-----
 mm/slob.c                | 11 +++++++----
 mm/slub.c                | 29 ++++++++++++++++-------------
 mm/util.c                |  2 +-
 7 files changed, 57 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 27402fea9b79..0289ec89300a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -30,6 +30,19 @@
 #define SLAB_MEM_SPREAD		0x00100000UL	/* Spread some memory over cpuset */
 #define SLAB_TRACE		0x00200000UL	/* Trace allocations and frees */
 
+/*
+ * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
+ *
+ * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
+ *
+ * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
+ * Both make kfree a no-op.
+ */
+#define ZERO_SIZE_PTR ((void *)16)
+
+#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) < \
+				(unsigned long)ZERO_SIZE_PTR)
+
 /*
  * struct kmem_cache related prototypes
  */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 365d036c454a..16e814ffab8d 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -32,6 +32,10 @@ static inline void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
 		int i = 0;
+
+		if (!size)
+			return ZERO_SIZE_PTR;
+
 #define CACHE(x) \
 		if (size <= x) \
 			goto found; \
@@ -58,6 +62,10 @@ static inline void *kzalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
 		int i = 0;
+
+		if (!size)
+			return ZERO_SIZE_PTR;
+
 #define CACHE(x) \
 		if (size <= x) \
 			goto found; \
@@ -88,6 +96,10 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	if (__builtin_constant_p(size)) {
 		int i = 0;
+
+		if (!size)
+			return ZERO_SIZE_PTR;
+
 #define CACHE(x) \
 		if (size <= x) \
 			goto found; \
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index a582f6771525..579b0a22858e 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -159,18 +159,6 @@ static inline struct kmem_cache *kmalloc_slab(size_t size)
 #define SLUB_DMA 0
 #endif
 
-
-/*
- * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
- *
- * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
- *
- * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
- * Both make kfree a no-op.
- */
-#define ZERO_SIZE_PTR ((void *)16)
-
-
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
 void *__kmalloc(size_t size, gfp_t flags);
 
diff --git a/mm/slab.c b/mm/slab.c
index 4bd8a53091b7..d2cd304fd8af 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -775,6 +775,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
 	 */
 	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
 #endif
+	if (!size)
+		return ZERO_SIZE_PTR;
+
 	while (size > csizep->cs_size)
 		csizep++;
 
@@ -2351,7 +2354,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		 * this should not happen at all.
 		 * But leave a BUG_ON for some lucky dude.
 		 */
-		BUG_ON(!cachep->slabp_cache);
+		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
 	}
 	cachep->ctor = ctor;
 	cachep->name = name;
@@ -3653,8 +3656,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 	struct kmem_cache *cachep;
 
 	cachep = kmem_find_general_cachep(size, flags);
-	if (unlikely(cachep == NULL))
-		return NULL;
+	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+		return cachep;
 	return kmem_cache_alloc_node(cachep, flags, node);
 }
 
@@ -3760,7 +3763,7 @@ void kfree(const void *objp)
 	struct kmem_cache *c;
 	unsigned long flags;
 
-	if (unlikely(!objp))
+	if (unlikely(ZERO_OR_NULL_PTR(objp)))
 		return;
 	local_irq_save(flags);
 	kfree_debugcheck(objp);
@@ -4447,7 +4450,7 @@ const struct seq_operations slabstats_op = {
  */
 size_t ksize(const void *objp)
 {
-	if (unlikely(objp == NULL))
+	if (unlikely(ZERO_OR_NULL_PTR(objp)))
 		return 0;
 
 	return obj_size(virt_to_cache(objp));
diff --git a/mm/slob.c b/mm/slob.c
index 154e7bdf3544..41d32c3c0be4 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -347,7 +347,7 @@ static void slob_free(void *block, int size)
 	slobidx_t units;
 	unsigned long flags;
 
-	if (!block)
+	if (ZERO_OR_NULL_PTR(block))
 		return;
 	BUG_ON(!size);
 
@@ -424,10 +424,13 @@ out:
 
 void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
+	unsigned int *m;
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 
 	if (size < PAGE_SIZE - align) {
-		unsigned int *m;
+		if (!size)
+			return ZERO_SIZE_PTR;
+
 		m = slob_alloc(size + align, gfp, align, node);
 		if (m)
 			*m = size;
@@ -450,7 +453,7 @@ void kfree(const void *block)
 {
 	struct slob_page *sp;
 
-	if (!block)
+	if (ZERO_OR_NULL_PTR(block))
 		return;
 
 	sp = (struct slob_page *)virt_to_page(block);
@@ -468,7 +471,7 @@ size_t ksize(const void *block)
 {
 	struct slob_page *sp;
 
-	if (!block)
+	if (ZERO_OR_NULL_PTR(block))
 		return 0;
 
 	sp = (struct slob_page *)virt_to_page(block);
diff --git a/mm/slub.c b/mm/slub.c
index 1b0a95d75dbb..548d78df81e1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2270,10 +2270,11 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
 	int index = kmalloc_index(size);
 
 	if (!index)
-		return NULL;
+		return ZERO_SIZE_PTR;
 
 	/* Allocation too large? */
-	BUG_ON(index < 0);
+	if (index < 0)
+		return NULL;
 
 #ifdef CONFIG_ZONE_DMA
 	if ((flags & SLUB_DMA)) {
@@ -2314,9 +2315,10 @@ void *__kmalloc(size_t size, gfp_t flags)
 {
 	struct kmem_cache *s = get_slab(size, flags);
 
-	if (s)
-		return slab_alloc(s, flags, -1, __builtin_return_address(0));
-	return ZERO_SIZE_PTR;
+	if (ZERO_OR_NULL_PTR(s))
+		return s;
+
+	return slab_alloc(s, flags, -1, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(__kmalloc);
 
@@ -2325,9 +2327,10 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	struct kmem_cache *s = get_slab(size, flags);
 
-	if (s)
-		return slab_alloc(s, flags, node, __builtin_return_address(0));
-	return ZERO_SIZE_PTR;
+	if (ZERO_OR_NULL_PTR(s))
+		return s;
+
+	return slab_alloc(s, flags, node, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(__kmalloc_node);
 #endif
@@ -2378,7 +2381,7 @@ void kfree(const void *x)
 	 * this comparison would be true for all "negative" pointers
 	 * (which would cover the whole upper half of the address space).
 	 */
-	if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR)
+	if (ZERO_OR_NULL_PTR(x))
 		return;
 
 	page = virt_to_head_page(x);
@@ -2687,8 +2690,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 {
 	struct kmem_cache *s = get_slab(size, gfpflags);
 
-	if (!s)
-		return ZERO_SIZE_PTR;
+	if (ZERO_OR_NULL_PTR(s))
+		return s;
 
 	return slab_alloc(s, gfpflags, -1, caller);
 }
@@ -2698,8 +2701,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 {
 	struct kmem_cache *s = get_slab(size, gfpflags);
 
-	if (!s)
-		return ZERO_SIZE_PTR;
+	if (ZERO_OR_NULL_PTR(s))
+		return s;
 
 	return slab_alloc(s, gfpflags, node, caller);
 }
diff --git a/mm/util.c b/mm/util.c
index 18396ea63ee6..f2f21b775516 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -76,7 +76,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
 
 	if (unlikely(!new_size)) {
 		kfree(p);
-		return NULL;
+		return ZERO_SIZE_PTR;
 	}
 
 	ks = ksize(p);
-- 
cgit v1.3-8-gc7d7


From 0c710013200e72b5e0bc680ff4ec6bdac53c5ce8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 17 Jul 2007 04:03:24 -0700
Subject: SLUB: add some more inlines and #ifdef CONFIG_SLUB_DEBUG

Add #ifdefs around data structures only needed if debugging is compiled into
SLUB.

Add inlines to small functions to reduce code size.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h |  4 ++++
 mm/slub.c                | 13 +++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 579b0a22858e..bae11111458f 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -16,7 +16,9 @@ struct kmem_cache_node {
 	unsigned long nr_partial;
 	atomic_long_t nr_slabs;
 	struct list_head partial;
+#ifdef CONFIG_SLUB_DEBUG
 	struct list_head full;
+#endif
 };
 
 /*
@@ -44,7 +46,9 @@ struct kmem_cache {
 	int align;		/* Alignment */
 	const char *name;	/* Name (only for display!) */
 	struct list_head list;	/* List of slab caches */
+#ifdef CONFIG_SLUB_DEBUG
 	struct kobject kobj;	/* For sysfs */
+#endif
 
 #ifdef CONFIG_NUMA
 	int defrag_ratio;
diff --git a/mm/slub.c b/mm/slub.c
index 479eb5c01917..55b508df62a3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -252,9 +252,10 @@ static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
 static void sysfs_slab_remove(struct kmem_cache *);
 #else
-static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
-static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
-static void sysfs_slab_remove(struct kmem_cache *s) {}
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
+							{ return 0; }
+static inline void sysfs_slab_remove(struct kmem_cache *s) {}
 #endif
 
 /********************************************************************
@@ -1395,7 +1396,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
 	unfreeze_slab(s, page);
 }
 
-static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
+static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
 {
 	slab_lock(page);
 	deactivate_slab(s, page, cpu);
@@ -1405,7 +1406,7 @@ static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
  * Flush cpu slab.
  * Called from IPI handler with interrupts disabled.
  */
-static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
 {
 	struct page *page = s->cpu_slab[cpu];
 
@@ -2165,7 +2166,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
 /*
  * Release all resources used by a slab cache.
  */
-static int kmem_cache_close(struct kmem_cache *s)
+static inline int kmem_cache_close(struct kmem_cache *s)
 {
 	int node;
 
-- 
cgit v1.3-8-gc7d7


From 81cda6626178cd55297831296ba8ecedbfd8b52d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 17 Jul 2007 04:03:29 -0700
Subject: Slab allocators: Cleanup zeroing allocations

It becomes now easy to support the zeroing allocs with generic inline
functions in slab.h.  Provide inline definitions to allow the continued use of
kzalloc, kmem_cache_zalloc etc but remove other definitions of zeroing
functions from the slab allocators and util.c.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h     | 77 +++++++++++++++++++++++++++++-------------------
 include/linux/slab_def.h | 30 -------------------
 include/linux/slub_def.h | 13 --------
 mm/slab.c                | 17 -----------
 mm/slob.c                | 10 -------
 mm/slub.c                | 11 -------
 mm/util.c                | 14 ---------
 7 files changed, 46 insertions(+), 126 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0289ec89300a..0e1d0daef6a2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -55,7 +55,6 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			void (*)(void *, struct kmem_cache *, unsigned long));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
-void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 void kmem_cache_free(struct kmem_cache *, void *);
 unsigned int kmem_cache_size(struct kmem_cache *);
 const char *kmem_cache_name(struct kmem_cache *);
@@ -91,11 +90,37 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
 /*
  * Common kmalloc functions provided by all allocators
  */
-void *__kzalloc(size_t, gfp_t);
 void * __must_check krealloc(const void *, size_t, gfp_t);
 void kfree(const void *);
 size_t ksize(const void *);
 
+/*
+ * Allocator specific definitions. These are mainly used to establish optimized
+ * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
+ * selecting the appropriate general cache at compile time.
+ *
+ * Allocators must define at least:
+ *
+ *	kmem_cache_alloc()
+ *	__kmalloc()
+ *	kmalloc()
+ *
+ * Those wishing to support NUMA must also define:
+ *
+ *	kmem_cache_alloc_node()
+ *	kmalloc_node()
+ *
+ * See each allocator definition file for additional comments and
+ * implementation notes.
+ */
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#elif defined(CONFIG_SLOB)
+#include <linux/slob_def.h>
+#else
+#include <linux/slab_def.h>
+#endif
+
 /**
  * kcalloc - allocate memory for an array. The memory is set to zero.
  * @n: number of elements.
@@ -151,37 +176,9 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
 	if (n != 0 && size > ULONG_MAX / n)
 		return NULL;
-	return __kzalloc(n * size, flags);
+	return __kmalloc(n * size, flags | __GFP_ZERO);
 }
 
-/*
- * Allocator specific definitions. These are mainly used to establish optimized
- * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
- * selecting the appropriate general cache at compile time.
- *
- * Allocators must define at least:
- *
- *	kmem_cache_alloc()
- *	__kmalloc()
- *	kmalloc()
- *	kzalloc()
- *
- * Those wishing to support NUMA must also define:
- *
- *	kmem_cache_alloc_node()
- *	kmalloc_node()
- *
- * See each allocator definition file for additional comments and
- * implementation notes.
- */
-#ifdef CONFIG_SLUB
-#include <linux/slub_def.h>
-#elif defined(CONFIG_SLOB)
-#include <linux/slob_def.h>
-#else
-#include <linux/slab_def.h>
-#endif
-
 #if !defined(CONFIG_NUMA) && !defined(CONFIG_SLOB)
 /**
  * kmalloc_node - allocate memory from a specific node
@@ -255,5 +252,23 @@ extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
 
 #endif /* DEBUG_SLAB */
 
+/*
+ * Shortcuts
+ */
+static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
+{
+	return kmem_cache_alloc(k, flags | __GFP_ZERO);
+}
+
+/**
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+static inline void *kzalloc(size_t size, gfp_t flags)
+{
+	return kmalloc(size, flags | __GFP_ZERO);
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SLAB_H */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 16e814ffab8d..32bdc2ffd715 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -58,36 +58,6 @@ found:
 	return __kmalloc(size, flags);
 }
 
-static inline void *kzalloc(size_t size, gfp_t flags)
-{
-	if (__builtin_constant_p(size)) {
-		int i = 0;
-
-		if (!size)
-			return ZERO_SIZE_PTR;
-
-#define CACHE(x) \
-		if (size <= x) \
-			goto found; \
-		else \
-			i++;
-#include "kmalloc_sizes.h"
-#undef CACHE
-		{
-			extern void __you_cannot_kzalloc_that_much(void);
-			__you_cannot_kzalloc_that_much();
-		}
-found:
-#ifdef CONFIG_ZONE_DMA
-		if (flags & GFP_DMA)
-			return kmem_cache_zalloc(malloc_sizes[i].cs_dmacachep,
-						flags);
-#endif
-		return kmem_cache_zalloc(malloc_sizes[i].cs_cachep, flags);
-	}
-	return __kzalloc(size, flags);
-}
-
 #ifdef CONFIG_NUMA
 extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
 extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index bae11111458f..07f7e4cbcee3 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -179,19 +179,6 @@ static inline void *kmalloc(size_t size, gfp_t flags)
 		return __kmalloc(size, flags);
 }
 
-static inline void *kzalloc(size_t size, gfp_t flags)
-{
-	if (__builtin_constant_p(size) && !(flags & SLUB_DMA)) {
-		struct kmem_cache *s = kmalloc_slab(size);
-
-		if (!s)
-			return ZERO_SIZE_PTR;
-
-		return kmem_cache_zalloc(s, flags);
-	} else
-		return __kzalloc(size, flags);
-}
-
 #ifdef CONFIG_NUMA
 void *__kmalloc_node(size_t size, gfp_t flags, int node);
 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
diff --git a/mm/slab.c b/mm/slab.c
index 1a88fded7f19..35056394139b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3589,23 +3589,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
-/**
- * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
- * @cache: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache and set the allocated memory to zero.
- * The flags are only relevant if the cache has no available objects.
- */
-void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
-{
-	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
-	if (ret)
-		memset(ret, 0, obj_size(cache));
-	return ret;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
 /**
  * kmem_ptr_validate - check if an untrusted pointer might
  *	be a slab entry.
diff --git a/mm/slob.c b/mm/slob.c
index b3a45588fc46..c89ef116d7aa 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -543,16 +543,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
-{
-	void *ret = kmem_cache_alloc(c, flags);
-	if (ret)
-		memset(ret, 0, c->size);
-
-	return ret;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
 static void __kmem_cache_free(void *b, int size)
 {
 	if (size < PAGE_SIZE)
diff --git a/mm/slub.c b/mm/slub.c
index 51ddd01604cd..510ee9a2cdb2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2706,17 +2706,6 @@ err:
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
-void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
-{
-	void *x;
-
-	x = slab_alloc(s, flags, -1, __builtin_return_address(0));
-	if (x)
-		memset(x, 0, s->objsize);
-	return x;
-}
-EXPORT_SYMBOL(kmem_cache_zalloc);
-
 #ifdef CONFIG_SMP
 /*
  * Use the cpu notifier to insure that the cpu slabs are flushed when
diff --git a/mm/util.c b/mm/util.c
index f2f21b775516..78f3783bdcc8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,20 +5,6 @@
 #include <asm/uaccess.h>
 
 /**
- * __kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- */
-void *__kzalloc(size_t size, gfp_t flags)
-{
-	void *ret = kmalloc_track_caller(size, flags);
-	if (ret)
-		memset(ret, 0, size);
-	return ret;
-}
-EXPORT_SYMBOL(__kzalloc);
-
-/*
  * kstrdup - allocate space for and copy an existing string
  *
  * @s: the string to duplicate
-- 
cgit v1.3-8-gc7d7


From b5fab14e5d87df4d94161ae5f5e0c8625f9ffda2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 17 Jul 2007 04:03:33 -0700
Subject: Add VM_BUG_ON in case someone uses page_mapping on a slab page

Detect slab objects being passed to the page oriented functions of the VM.

It is not sufficient to simply return NULL because the functions calling
page_mapping may depend on other items of the page_struct also to be setup
properly.  Moreover slab object may not be properly aligned.  The page
oriented functions of the VM expect to operate on page aligned, page sized
objects.  Operations on object straddling page boundaries may only affect the
objects partially which may lead to surprising results.

It is better to detect eventually remaining uses and eliminate them.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4c482a3ee870..a5c451816fdc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -599,6 +599,7 @@ static inline struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	VM_BUG_ON(PageSlab(page));
 	if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
 #ifdef CONFIG_SLUB
-- 
cgit v1.3-8-gc7d7


From 831441862956fffa17b9801db37e6ea1650b0f69 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 17 Jul 2007 04:03:35 -0700
Subject: Freezer: make kernel threads nonfreezable by default

Currently, the freezer treats all tasks as freezable, except for the kernel
threads that explicitly set the PF_NOFREEZE flag for themselves.  This
approach is problematic, since it requires every kernel thread to either
set PF_NOFREEZE explicitly, or call try_to_freeze(), even if it doesn't
care for the freezing of tasks at all.

It seems better to only require the kernel threads that want to or need to
be frozen to use some freezer-related code and to remove any
freezer-related code from the other (nonfreezable) kernel threads, which is
done in this patch.

The patch causes all kernel threads to be nonfreezable by default (ie.  to
have PF_NOFREEZE set by default) and introduces the set_freezable()
function that should be called by the freezable kernel threads in order to
unset PF_NOFREEZE.  It also makes all of the currently freezable kernel
threads call set_freezable(), so it shouldn't cause any (intentional)
change of behaviour to appear.  Additionally, it updates documentation to
describe the freezing of tasks more accurately.

[akpm@linux-foundation.org: build fixes]
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/power/freezing-of-tasks.txt | 160 ++++++++++++++++++++++++++++++
 Documentation/power/kernel_threads.txt    |  40 --------
 Documentation/power/swsusp.txt            |  18 +---
 arch/i386/kernel/apm.c                    |   2 +-
 arch/i386/kernel/io_apic.c                |   1 +
 drivers/block/loop.c                      |   8 +-
 drivers/block/pktcdvd.c                   |   1 +
 drivers/char/apm-emulation.c              |  12 +--
 drivers/char/hvc_console.c                |   1 +
 drivers/edac/edac_mc.c                    |   1 +
 drivers/ieee1394/ieee1394_core.c          |   3 +-
 drivers/ieee1394/nodemgr.c                |   1 +
 drivers/input/gameport/gameport.c         |   1 +
 drivers/input/serio/serio.c               |   1 +
 drivers/input/touchscreen/ucb1400_ts.c    |   1 +
 drivers/macintosh/therm_adt746x.c         |   1 +
 drivers/macintosh/windfarm_core.c         |   1 +
 drivers/md/md.c                           |   1 -
 drivers/media/dvb/dvb-core/dvb_frontend.c |   1 +
 drivers/media/video/cx88/cx88-tvaudio.c   |   1 +
 drivers/media/video/msp3400-kthreads.c    |   6 +-
 drivers/media/video/tvaudio.c             |   2 +-
 drivers/media/video/video-buf-dvb.c       |   1 +
 drivers/media/video/vivi.c                |   1 +
 drivers/mfd/ucb1x00-ts.c                  |   1 +
 drivers/mmc/card/queue.c                  |   7 +-
 drivers/mtd/mtd_blkdevs.c                 |   3 +-
 drivers/mtd/ubi/wl.c                      |   1 +
 drivers/net/wireless/airo.c               |   3 +-
 drivers/net/wireless/libertas/main.c      |   1 +
 drivers/pcmcia/cs.c                       |   1 +
 drivers/pnp/pnpbios/core.c                |   1 +
 drivers/scsi/libsas/sas_scsi_host.c       |   3 +-
 drivers/scsi/scsi_error.c                 |   3 +-
 drivers/usb/atm/ueagle-atm.c              |   1 +
 drivers/usb/core/hub.c                    |   1 +
 drivers/usb/gadget/file_storage.c         |   3 +
 drivers/usb/storage/usb.c                 |   3 +-
 drivers/video/ps3fb.c                     |   1 +
 drivers/w1/w1.c                           |   1 +
 fs/cifs/cifsfs.c                          |   1 +
 fs/cifs/connect.c                         |   1 +
 fs/jffs2/background.c                     |   1 +
 fs/lockd/svc.c                            |   2 +
 fs/nfs/callback.c                         |   2 +
 fs/nfsd/nfssvc.c                          |   2 +
 fs/xfs/linux-2.6/xfs_super.c              |   1 +
 include/linux/freezer.h                   |  14 +++
 init/do_mounts_initrd.c                   |   7 +-
 kernel/audit.c                            |   1 +
 kernel/exit.c                             |   6 ++
 kernel/fork.c                             |   2 +-
 kernel/rcutorture.c                       |   4 +-
 kernel/rtmutex-tester.c                   |   1 +
 kernel/sched.c                            |   3 -
 kernel/softirq.c                          |   3 +-
 kernel/softlockup.c                       |   2 +-
 kernel/workqueue.c                        |   4 +-
 mm/pdflush.c                              |   1 +
 mm/vmscan.c                               |   1 +
 net/bluetooth/bnep/core.c                 |   2 +-
 net/bluetooth/cmtp/core.c                 |   2 +-
 net/bluetooth/hidp/core.c                 |   2 +-
 net/bluetooth/rfcomm/core.c               |   2 +-
 net/core/pktgen.c                         |   2 +
 65 files changed, 256 insertions(+), 113 deletions(-)
 create mode 100644 Documentation/power/freezing-of-tasks.txt
 delete mode 100644 Documentation/power/kernel_threads.txt

(limited to 'include/linux')

diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
new file mode 100644
index 000000000000..af1a282c71a3
--- /dev/null
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -0,0 +1,160 @@
+Freezing of tasks
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+I. What is the freezing of tasks?
+
+The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures).
+
+II. How does it work?
+
+There are four per-task flags used for that, PF_NOFREEZE, PF_FROZEN, TIF_FREEZE
+and PF_FREEZER_SKIP (the last one is auxiliary).  The tasks that have
+PF_NOFREEZE unset (all user space processes and some kernel threads) are
+regarded as 'freezable' and treated in a special way before the system enters a
+suspend state as well as before a hibernation image is created (in what follows
+we only consider hibernation, but the description also applies to suspend).
+
+Namely, as the first step of the hibernation procedure the function
+freeze_processes() (defined in kernel/power/process.c) is called.  It executes
+try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
+sends a fake signal to each of them.  A task that receives such a signal and has
+TIF_FREEZE set, should react to it by calling the refrigerator() function
+(defined in kernel/power/process.c), which sets the task's PF_FROZEN flag,
+changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is
+cleared for it.  Then, we say that the task is 'frozen' and therefore the set of
+functions handling this mechanism is called 'the freezer' (these functions are
+defined in kernel/power/process.c and include/linux/freezer.h).  User space
+processes are generally frozen before kernel threads.
+
+It is not recommended to call refrigerator() directly.  Instead, it is
+recommended to use the try_to_freeze() function (defined in
+include/linux/freezer.h), that checks the task's TIF_FREEZE flag and makes the
+task enter refrigerator() if the flag is set.
+
+For user space processes try_to_freeze() is called automatically from the
+signal-handling code, but the freezable kernel threads need to call it
+explicitly in suitable places.  The code to do this may look like the following:
+
+	do {
+		hub_events();
+		wait_event_interruptible(khubd_wait,
+					!list_empty(&hub_event_list));
+		try_to_freeze();
+	} while (!signal_pending(current));
+
+(from drivers/usb/core/hub.c::hub_thread()).
+
+If a freezable kernel thread fails to call try_to_freeze() after the freezer has
+set TIF_FREEZE for it, the freezing of tasks will fail and the entire
+hibernation operation will be cancelled.  For this reason, freezable kernel
+threads must call try_to_freeze() somewhere.
+
+After the system memory state has been restored from a hibernation image and
+devices have been reinitialized, the function thaw_processes() is called in
+order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
+have been frozen leave refrigerator() and continue running.
+
+III. Which kernel threads are freezable?
+
+Kernel threads are not freezable by default.  However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is strongly discouraged).  From this point it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+
+Generally speaking, there is a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+hibernation.  At the moment we have no simple means of checkpointing
+filesystems, so if there are any modifications made to filesystem data and/or
+metadata on disks, we cannot bring them back to the state from before the
+modifications.  At the same time each hibernation image contains some
+filesystem-related information that must be consistent with the state of the
+on-disk data and metadata after the system memory state has been restored from
+the image (otherwise the filesystems will be damaged in a nasty way, usually
+making them almost impossible to repair).  We therefore freeze tasks that might
+cause the on-disk filesystems' data and metadata to be modified after the
+hibernation image has been created and before the system is finally powered off.
+The majority of these are user space processes, but if any of the kernel threads
+may cause something like this to happen, they have to be freezable.
+
+2. The second reason is to prevent user space processes and some kernel threads
+from interfering with the suspending and resuming of devices.  A user space
+process running on a second CPU while we are suspending devices may, for
+example, be troublesome and without the freezing of tasks we would need some
+safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I _do_ realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA.  So we want to be able to
+avoid *that*, there's no question about that.  And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable.  For example, if
+a kernel that belongs to a device driver accesses the device directly, it in
+principle needs to know when the device is suspended, so that it doesn't try to
+access it at that time.  However, if the kernel thread is freezable, it will be
+frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+3. Another reason for freezing tasks is to prevent user space processes from
+realizing that hibernation (or suspend) operation takes place.  Ideally, user
+space processes should not notice that such a system-wide operation has occurred
+and should continue running without any problems after the restore (or resume
+from suspend).  Unfortunately, in the most general case this is quite difficult
+to achieve without the freezing of tasks.  Consider, for example, a process
+that depends on all CPUs being online while it's running.  Since we need to
+disable nonboot CPUs during the hibernation, if this process is not frozen, it
+may notice that the number of CPUs has changed and may start to work incorrectly
+because of that.
+
+V. Are there any problems related to the freezing of tasks?
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend one
+on another.  For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable.  That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+userspace, it gets even more complicated because some userspace processes are
+now doing the sorts of things that kernel threads do
+(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+The problem 1. seems to be fixable, although it hasn't been fixed so far.  The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+avoid the realization by the user space processes that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it.  For example, if request_firmware() is
+called from a device driver's .resume() routine, it will timeout and eventually
+fail, because the user land process that should respond to the request is frozen
+at this point.  So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet.  In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used.  Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.  [The solution to this particular
+problem is to keep the firmware in memory after it's loaded for the first time
+and upload if from memory to the device whenever necessary.]
diff --git a/Documentation/power/kernel_threads.txt b/Documentation/power/kernel_threads.txt
deleted file mode 100644
index fb57784986b1..000000000000
--- a/Documentation/power/kernel_threads.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-KERNEL THREADS
-
-
-Freezer
-
-Upon entering a suspended state the system will freeze all
-tasks. This is done by delivering pseudosignals. This affects
-kernel threads, too. To successfully freeze a kernel thread
-the thread has to check for the pseudosignal and enter the
-refrigerator. Code to do this looks like this:
-
-	do {
-		hub_events();
-		wait_event_interruptible(khubd_wait, !list_empty(&hub_event_list));
-		try_to_freeze();
-	} while (!signal_pending(current));
-
-from drivers/usb/core/hub.c::hub_thread()
-
-
-The Unfreezable
-
-Some kernel threads however, must not be frozen. The kernel must
-be able to finish pending IO operations and later on be able to
-write the memory image to disk. Kernel threads needed to do IO
-must stay awake. Such threads must mark themselves unfreezable
-like this:
-
-	/*
-	 * This thread doesn't need any user-level access,
-	 * so get rid of all our resources.
-	 */
-	daemonize("usb-storage");
-
-	current->flags |= PF_NOFREEZE;
-
-from drivers/usb/storage/usb.c::usb_stor_control_thread()
-
-Such drivers are themselves responsible for staying quiet during
-the actual snapshotting.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index 152b510d1bbb..aea7e9209667 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -140,21 +140,11 @@ should be sent to the mailing list available through the suspend2
 website, and not to the Linux Kernel Mailing List. We are working
 toward merging suspend2 into the mainline kernel.
 
-Q: A kernel thread must voluntarily freeze itself (call 'refrigerator').
-I found some kernel threads that don't do it, and they don't freeze
-so the system can't sleep. Is this a known behavior?
-
-A: All such kernel threads need to be fixed, one by one. Select the
-place where the thread is safe to be frozen (no kernel semaphores
-should be held at that point and it must be safe to sleep there), and
-add:
-
-       try_to_freeze();
-
-If the thread is needed for writing the image to storage, you should
-instead set the PF_NOFREEZE process flag when creating the thread (and
-be very careful).
+Q: What is the freezing of tasks and why are we using it?
 
+A: The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures).  See freezing-of-tasks.txt for details.
 
 Q: What is the difference between "platform" and "shutdown"?
 
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 4112afe712b9..47001d50a083 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -222,6 +222,7 @@
 #include <linux/capability.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/freezer.h>
 #include <linux/smp.h>
 #include <linux/dmi.h>
 #include <linux/suspend.h>
@@ -2311,7 +2312,6 @@ static int __init apm_init(void)
 		remove_proc_entry("apm", NULL);
 		return err;
 	}
-	kapmd_task->flags |= PF_NOFREEZE;
 	wake_up_process(kapmd_task);
 
 	if (num_online_cpus() > 1 && !smp ) {
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 7f8b7af2b95f..21db8f56c9a1 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -667,6 +667,7 @@ static int balanced_irq(void *unused)
 		set_pending_irq(i, cpumask_of_cpu(0));
 	}
 
+	set_freezable();
 	for ( ; ; ) {
 		time_remaining = schedule_timeout_interruptible(time_remaining);
 		try_to_freeze();
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4503290da407..06eaa11cbc2f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -68,6 +68,7 @@
 #include <linux/loop.h>
 #include <linux/compat.h>
 #include <linux/suspend.h>
+#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>		/* for invalidate_bdev() */
 #include <linux/completion.h>
@@ -600,13 +601,6 @@ static int loop_thread(void *data)
 	struct loop_device *lo = data;
 	struct bio *bio;
 
-	/*
-	 * loop can be used in an encrypted device,
-	 * hence, it mustn't be stopped at all
-	 * because it could be indirectly used during suspension
-	 */
-	current->flags |= PF_NOFREEZE;
-
 	set_user_nice(current, -20);
 
 	while (!kthread_should_stop() || lo->lo_bio) {
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 7c294a40002e..31be33e4f119 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1593,6 +1593,7 @@ static int kcdrwd(void *foobar)
 	long min_sleep_time, residue;
 
 	set_user_nice(current, -20);
+	set_freezable();
 
 	for (;;) {
 		DECLARE_WAITQUEUE(wait, current);
diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c
index 179c7a3b6e75..ec116df919d9 100644
--- a/drivers/char/apm-emulation.c
+++ b/drivers/char/apm-emulation.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/pm.h>
 #include <linux/apm-emulation.h>
+#include <linux/freezer.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
@@ -329,13 +330,8 @@ apm_ioctl(struct inode * inode, struct file *filp, u_int cmd, u_long arg)
 			/*
 			 * Wait for the suspend/resume to complete.  If there
 			 * are pending acknowledges, we wait here for them.
-			 *
-			 * Note: we need to ensure that the PM subsystem does
-			 * not kick us out of the wait when it suspends the
-			 * threads.
 			 */
 			flags = current->flags;
-			current->flags |= PF_NOFREEZE;
 
 			wait_event(apm_suspend_waitqueue,
 				   as->suspend_state == SUSPEND_DONE);
@@ -365,13 +361,8 @@ apm_ioctl(struct inode * inode, struct file *filp, u_int cmd, u_long arg)
 			/*
 			 * Wait for the suspend/resume to complete.  If there
 			 * are pending acknowledges, we wait here for them.
-			 *
-			 * Note: we need to ensure that the PM subsystem does
-			 * not kick us out of the wait when it suspends the
-			 * threads.
 			 */
 			flags = current->flags;
-			current->flags |= PF_NOFREEZE;
 
 			wait_event_interruptible(apm_suspend_waitqueue,
 					 as->suspend_state == SUSPEND_DONE);
@@ -598,7 +589,6 @@ static int __init apm_init(void)
 		kapmd_tsk = NULL;
 		return ret;
 	}
-	kapmd_tsk->flags |= PF_NOFREEZE;
 	wake_up_process(kapmd_tsk);
 
 #ifdef CONFIG_PROC_FS
diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index b3ab42e0dd4a..83c1151ec7a2 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -679,6 +679,7 @@ static int khvcd(void *unused)
 	int poll_mask;
 	struct hvc_struct *hp;
 
+	set_freezable();
 	__set_current_state(TASK_RUNNING);
 	do {
 		poll_mask = 0;
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 7b622300d0e5..804875de5801 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -1906,6 +1906,7 @@ static void do_edac_check(void)
 
 static int edac_kernel_thread(void *arg)
 {
+	set_freezable();
 	while (!kthread_should_stop()) {
 		do_edac_check();
 
diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index 0fc8c6e559e4..ee45259573c8 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -30,6 +30,7 @@
 #include <linux/moduleparam.h>
 #include <linux/bitops.h>
 #include <linux/kdev_t.h>
+#include <linux/freezer.h>
 #include <linux/suspend.h>
 #include <linux/kthread.h>
 #include <linux/preempt.h>
@@ -1128,8 +1129,6 @@ static int hpsbpkt_thread(void *__hi)
 	struct list_head tmp;
 	int may_schedule;
 
-	current->flags |= PF_NOFREEZE;
-
 	while (!kthread_should_stop()) {
 
 		INIT_LIST_HEAD(&tmp);
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 51a12062ed36..2ffd53461db6 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1699,6 +1699,7 @@ static int nodemgr_host_thread(void *__hi)
 	unsigned int g, generation = 0;
 	int i, reset_cycles = 0;
 
+	set_freezable();
 	/* Setup our device-model entries */
 	nodemgr_create_host_dev_files(host);
 
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index bd686a2a517d..20896d5e5f0e 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -445,6 +445,7 @@ static struct gameport *gameport_get_pending_child(struct gameport *parent)
 
 static int gameport_thread(void *nothing)
 {
+	set_freezable();
 	do {
 		gameport_handle_event();
 		wait_event_interruptible(gameport_wait,
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index a8f3bc1dff22..372ca4931194 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -384,6 +384,7 @@ static struct serio *serio_get_pending_child(struct serio *parent)
 
 static int serio_thread(void *nothing)
 {
+	set_freezable();
 	do {
 		serio_handle_event();
 		wait_event_interruptible(serio_wait,
diff --git a/drivers/input/touchscreen/ucb1400_ts.c b/drivers/input/touchscreen/ucb1400_ts.c
index f0cbcdb008ed..36f944019158 100644
--- a/drivers/input/touchscreen/ucb1400_ts.c
+++ b/drivers/input/touchscreen/ucb1400_ts.c
@@ -292,6 +292,7 @@ static int ucb1400_ts_thread(void *_ucb)
 
 	sched_setscheduler(tsk, SCHED_FIFO, &param);
 
+	set_freezable();
 	while (!kthread_should_stop()) {
 		unsigned int x, y, p;
 		long timeout;
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index bd55e6ab99fc..f25685b9b7cf 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -335,6 +335,7 @@ static int monitor_task(void *arg)
 {
 	struct thermostat* th = arg;
 
+	set_freezable();
 	while(!kthread_should_stop()) {
 		try_to_freeze();
 		msleep_interruptible(2000);
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index 4fcb245ba184..e18d265d5d33 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -92,6 +92,7 @@ static int wf_thread_func(void *data)
 
 	DBG("wf: thread started\n");
 
+	set_freezable();
 	while(!kthread_should_stop()) {
 		if (time_after_eq(jiffies, next)) {
 			wf_notify(WF_EVENT_TICK, NULL);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 33beaa7da085..9aefc4a023df 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4642,7 +4642,6 @@ static int md_thread(void * arg)
 	 * many dirty RAID5 blocks.
 	 */
 
-	current->flags |= PF_NOFREEZE;
 	allow_signal(SIGKILL);
 	while (!kthread_should_stop()) {
 
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index f4e4ca2dcade..b6c7f6610ec5 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -523,6 +523,7 @@ static int dvb_frontend_thread(void *data)
 
 	dvb_frontend_init(fe);
 
+	set_freezable();
 	while (1) {
 		up(&fepriv->sem);	    /* is locked when we enter the thread... */
 restart:
diff --git a/drivers/media/video/cx88/cx88-tvaudio.c b/drivers/media/video/cx88/cx88-tvaudio.c
index 259ea08e784f..1cc2d286a1cb 100644
--- a/drivers/media/video/cx88/cx88-tvaudio.c
+++ b/drivers/media/video/cx88/cx88-tvaudio.c
@@ -906,6 +906,7 @@ int cx88_audio_thread(void *data)
 	u32 mode = 0;
 
 	dprintk("cx88: tvaudio thread started\n");
+	set_freezable();
 	for (;;) {
 		msleep_interruptible(1000);
 		if (kthread_should_stop())
diff --git a/drivers/media/video/msp3400-kthreads.c b/drivers/media/video/msp3400-kthreads.c
index e1821eb82fb5..d5ee2629121e 100644
--- a/drivers/media/video/msp3400-kthreads.c
+++ b/drivers/media/video/msp3400-kthreads.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
+#include <linux/freezer.h>
 #include <linux/videodev.h>
 #include <linux/videodev2.h>
 #include <media/v4l2-common.h>
@@ -468,6 +469,7 @@ int msp3400c_thread(void *data)
 
 
 	v4l_dbg(1, msp_debug, client, "msp3400 daemon started\n");
+	set_freezable();
 	for (;;) {
 		v4l_dbg(2, msp_debug, client, "msp3400 thread: sleep\n");
 		msp_sleep(state, -1);
@@ -646,7 +648,7 @@ int msp3410d_thread(void *data)
 	int val, i, std, count;
 
 	v4l_dbg(1, msp_debug, client, "msp3410 daemon started\n");
-
+	set_freezable();
 	for (;;) {
 		v4l_dbg(2, msp_debug, client, "msp3410 thread: sleep\n");
 		msp_sleep(state,-1);
@@ -940,7 +942,7 @@ int msp34xxg_thread(void *data)
 	int val, i;
 
 	v4l_dbg(1, msp_debug, client, "msp34xxg daemon started\n");
-
+	set_freezable();
 	for (;;) {
 		v4l_dbg(2, msp_debug, client, "msp34xxg thread: sleep\n");
 		msp_sleep(state, -1);
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index c9bf9dbc2ea3..9da338dc4f3b 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -271,7 +271,7 @@ static int chip_thread(void *data)
 	struct CHIPDESC  *desc = chiplist + chip->type;
 
 	v4l_dbg(1, debug, &chip->c, "%s: thread started\n", chip->c.name);
-
+	set_freezable();
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (!kthread_should_stop())
diff --git a/drivers/media/video/video-buf-dvb.c b/drivers/media/video/video-buf-dvb.c
index fcc5467e7636..e617925ba31e 100644
--- a/drivers/media/video/video-buf-dvb.c
+++ b/drivers/media/video/video-buf-dvb.c
@@ -47,6 +47,7 @@ static int videobuf_dvb_thread(void *data)
 	int err;
 
 	dprintk("dvb thread started\n");
+	set_freezable();
 	videobuf_read_start(&dvb->dvbq);
 
 	for (;;) {
diff --git a/drivers/media/video/vivi.c b/drivers/media/video/vivi.c
index f7e1d1910374..3ef4d0159c33 100644
--- a/drivers/media/video/vivi.c
+++ b/drivers/media/video/vivi.c
@@ -573,6 +573,7 @@ static int vivi_thread(void *data)
 	dprintk(1,"thread started\n");
 
 	mod_timer(&dma_q->timeout, jiffies+BUFFER_TIMEOUT);
+	set_freezable();
 
 	for (;;) {
 		vivi_sleep(dma_q);
diff --git a/drivers/mfd/ucb1x00-ts.c b/drivers/mfd/ucb1x00-ts.c
index 38e815a2e871..fdbaa776f249 100644
--- a/drivers/mfd/ucb1x00-ts.c
+++ b/drivers/mfd/ucb1x00-ts.c
@@ -209,6 +209,7 @@ static int ucb1x00_thread(void *_ts)
 	DECLARE_WAITQUEUE(wait, tsk);
 	int valid = 0;
 
+	set_freezable();
 	add_wait_queue(&ts->irq_wait, &wait);
 	while (!kthread_should_stop()) {
 		unsigned int x, y, p;
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 4fb2089dc690..b53dac8d1b69 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -11,6 +11,7 @@
  */
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/freezer.h>
 #include <linux/kthread.h>
 
 #include <linux/mmc/card.h>
@@ -44,11 +45,7 @@ static int mmc_queue_thread(void *d)
 	struct mmc_queue *mq = d;
 	struct request_queue *q = mq->queue;
 
-	/*
-	 * Set iothread to ensure that we aren't put to sleep by
-	 * the process freezing.  We handle suspension ourselves.
-	 */
-	current->flags |= PF_MEMALLOC|PF_NOFREEZE;
+	current->flags |= PF_MEMALLOC;
 
 	down(&mq->thread_sem);
 	do {
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 51bc7e2f1f22..ef89780eb9d6 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -16,6 +16,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
+#include <linux/freezer.h>
 #include <linux/spinlock.h>
 #include <linux/hdreg.h>
 #include <linux/init.h>
@@ -80,7 +81,7 @@ static int mtd_blktrans_thread(void *arg)
 	struct request_queue *rq = tr->blkcore_priv->rq;
 
 	/* we might get involved when memory gets low, so use PF_MEMALLOC */
-	current->flags |= PF_MEMALLOC | PF_NOFREEZE;
+	current->flags |= PF_MEMALLOC;
 
 	spin_lock_irq(rq->queue_lock);
 	while (!kthread_should_stop()) {
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index 9ecaf77eca9e..ab2174a56bc2 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1346,6 +1346,7 @@ static int ubi_thread(void *u)
 	ubi_msg("background thread \"%s\" started, PID %d",
 		ubi->bgt_name, current->pid);
 
+	set_freezable();
 	for (;;) {
 		int err;
 
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 1c54908fdc4c..ee1cc14db389 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -3086,7 +3086,8 @@ static int airo_thread(void *data) {
 	struct net_device *dev = data;
 	struct airo_info *ai = dev->priv;
 	int locked;
-	
+
+	set_freezable();
 	while(1) {
 		/* make swsusp happy with our thread */
 		try_to_freeze();
diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c
index 4a59306a3f05..9f366242c392 100644
--- a/drivers/net/wireless/libertas/main.c
+++ b/drivers/net/wireless/libertas/main.c
@@ -613,6 +613,7 @@ static int wlan_service_main_thread(void *data)
 
 	init_waitqueue_entry(&wait, current);
 
+	set_freezable();
 	for (;;) {
 		lbs_deb_thread( "main-thread 111: intcounter=%d "
 		       "currenttxskb=%p dnld_sent=%d\n",
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index 50cad3a59a6c..7c93a108f9b8 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -651,6 +651,7 @@ static int pccardd(void *__skt)
 	add_wait_queue(&skt->thread_wait, &wait);
 	complete(&skt->thread_done);
 
+	set_freezable();
 	for (;;) {
 		unsigned long flags;
 		unsigned int events;
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index 3a201b77b963..03baf1c64a2e 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -160,6 +160,7 @@ static int pnp_dock_thread(void * unused)
 {
 	static struct pnp_docking_station_info now;
 	int docked = -1, d = 0;
+	set_freezable();
 	while (!unloading)
 	{
 		int status;
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index d70ddfda93fc..9c5342e7a69c 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -40,6 +40,7 @@
 
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/freezer.h>
 #include <linux/scatterlist.h>
 
 /* ---------- SCSI Host glue ---------- */
@@ -868,8 +869,6 @@ static int sas_queue_thread(void *_sas_ha)
 {
 	struct sas_ha_struct *sas_ha = _sas_ha;
 
-	current->flags |= PF_NOFREEZE;
-
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 9adb64ac054c..8a525abda30f 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -19,6 +19,7 @@
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
@@ -1516,8 +1517,6 @@ int scsi_error_handler(void *data)
 {
 	struct Scsi_Host *shost = data;
 
-	current->flags |= PF_NOFREEZE;
-
 	/*
 	 * We use TASK_INTERRUPTIBLE so that the thread is not
 	 * counted against the load average as a running process.
diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c
index 4973e147bc79..8f046659b4e9 100644
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -1168,6 +1168,7 @@ static int uea_kthread(void *data)
 	struct uea_softc *sc = data;
 	int ret = -EAGAIN;
 
+	set_freezable();
 	uea_enters(INS_TO_USBDEV(sc));
 	while (!kthread_should_stop()) {
 		if (ret < 0 || sc->reset)
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 50e79010401c..fd74c50b1804 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2728,6 +2728,7 @@ loop:
 
 static int hub_thread(void *__unused)
 {
+	set_freezable();
 	do {
 		hub_events();
 		wait_event_interruptible(khubd_wait,
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 8712ef987179..be7a1bd2823b 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -3434,6 +3434,9 @@ static int fsg_main_thread(void *fsg_)
 	allow_signal(SIGKILL);
 	allow_signal(SIGUSR1);
 
+	/* Allow the thread to be frozen */
+	set_freezable();
+
 	/* Arrange for userspace references to be interpreted as kernel
 	 * pointers.  That way we can pass a kernel pointer to a routine
 	 * that expects a __user pointer and it will work okay. */
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index bef8bcd9bd98..28842d208bb0 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -311,8 +311,6 @@ static int usb_stor_control_thread(void * __us)
 	struct Scsi_Host *host = us_to_host(us);
 	int autopm_rc;
 
-	current->flags |= PF_NOFREEZE;
-
 	for(;;) {
 		US_DEBUGP("*** thread sleeping.\n");
 		if(down_interruptible(&us->sema))
@@ -920,6 +918,7 @@ static int usb_stor_scan_thread(void * __us)
 	printk(KERN_DEBUG
 		"usb-storage: device found at %d\n", us->pusb_dev->devnum);
 
+	set_freezable();
 	/* Wait for the timeout to expire or for a disconnect */
 	if (delay_use > 0) {
 		printk(KERN_DEBUG "usb-storage: waiting for device "
diff --git a/drivers/video/ps3fb.c b/drivers/video/ps3fb.c
index 08b7ffbbbbd8..3972aa8cf859 100644
--- a/drivers/video/ps3fb.c
+++ b/drivers/video/ps3fb.c
@@ -812,6 +812,7 @@ static int ps3fb_ioctl(struct fb_info *info, unsigned int cmd,
 
 static int ps3fbd(void *arg)
 {
+	set_freezable();
 	while (!kthread_should_stop()) {
 		try_to_freeze();
 		set_current_state(TASK_INTERRUPTIBLE);
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index f5c5b760ed7b..c6332108f1c5 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -805,6 +805,7 @@ static int w1_control(void *data)
 	struct w1_master *dev, *n;
 	int have_to_wait = 0;
 
+	set_freezable();
 	while (!kthread_should_stop() || have_to_wait) {
 		have_to_wait = 0;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8b0cbf4a4ad0..bd0f2f2353ce 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -849,6 +849,7 @@ static int cifs_oplock_thread(void * dummyarg)
 	__u16  netfid;
 	int rc;
 
+	set_freezable();
 	do {
 		if (try_to_freeze()) 
 			continue;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f4e92661b223..0a1b8bd1dfcb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -363,6 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 			GFP_KERNEL);
 	}
 
+	set_freezable();
 	while (!kthread_should_stop()) {
 		if (try_to_freeze())
 			continue;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 0c82dfcfd246..143c5530caf3 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -81,6 +81,7 @@ static int jffs2_garbage_collect_thread(void *_c)
 
 	set_user_nice(current, 10);
 
+	set_freezable();
 	for (;;) {
 		allow_signal(SIGHUP);
 
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 26809325469c..9fcdef50aad6 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,6 +25,7 @@
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -119,6 +120,7 @@ lockd(struct svc_rqst *rqstp)
 	complete(&lockd_start_done);
 
 	daemonize("lockd");
+	set_freezable();
 
 	/* Process request with signals blocked, but allow SIGKILL.  */
 	allow_signal(SIGKILL);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f309c8741a..a796be5051bf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,7 @@
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <net/inet_sock.h>
 
@@ -67,6 +68,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
 	daemonize("nfsv4-svc");
 	/* Process request with signals blocked, but allow SIGKILL.  */
 	allow_signal(SIGKILL);
+	set_freezable();
 
 	complete(&nfs_callback_info.started);
 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ff55950efb43..5c8192bcbced 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
+#include <linux/freezer.h>
 #include <linux/fs_struct.h>
 
 #include <linux/sunrpc/types.h>
@@ -432,6 +433,7 @@ nfsd(struct svc_rqst *rqstp)
 	 * dirty pages.
 	 */
 	current->flags |= PF_LESS_THROTTLE;
+	set_freezable();
 
 	/*
 	 * The main request loop
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 06894cf00b12..4528f9a3f304 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -562,6 +562,7 @@ xfssyncd(
 	bhv_vfs_sync_work_t	*work, *n;
 	LIST_HEAD		(tmp);
 
+	set_freezable();
 	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
 	for (;;) {
 		timeleft = schedule_timeout_interruptible(timeleft);
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 4631086f5060..2d38b1a74662 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -1,5 +1,8 @@
 /* Freezer declarations */
 
+#ifndef FREEZER_H_INCLUDED
+#define FREEZER_H_INCLUDED
+
 #include <linux/sched.h>
 
 #ifdef CONFIG_PM
@@ -115,6 +118,14 @@ static inline int freezer_should_skip(struct task_struct *p)
 	return !!(p->flags & PF_FREEZER_SKIP);
 }
 
+/*
+ * Tell the freezer that the current task should be frozen by it
+ */
+static inline void set_freezable(void)
+{
+	current->flags &= ~PF_NOFREEZE;
+}
+
 #else
 static inline int frozen(struct task_struct *p) { return 0; }
 static inline int freezing(struct task_struct *p) { return 0; }
@@ -130,4 +141,7 @@ static inline int try_to_freeze(void) { return 0; }
 static inline void freezer_do_not_count(void) {}
 static inline void freezer_count(void) {}
 static inline int freezer_should_skip(struct task_struct *p) { return 0; }
+static inline void set_freezable(void) {}
 #endif
+
+#endif	/* FREEZER_H_INCLUDED */
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index b222ce9e1c8b..a6b4c0c08e13 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -56,12 +56,9 @@ static void __init handle_initrd(void)
 	sys_chroot(".");
 
 	pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
-	if (pid > 0) {
-		while (pid != sys_wait4(-1, NULL, 0, NULL)) {
-			try_to_freeze();
+	if (pid > 0)
+		while (pid != sys_wait4(-1, NULL, 0, NULL))
 			yield();
-		}
-	}
 
 	/* move initrd to rootfs' /old */
 	sys_fchdir(old_fd);
diff --git a/kernel/audit.c b/kernel/audit.c
index 5ce8851facf7..eb0f9165b401 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -392,6 +392,7 @@ static int kauditd_thread(void *dummy)
 {
 	struct sk_buff *skb;
 
+	set_freezable();
 	while (!kthread_should_stop()) {
 		skb = skb_dequeue(&audit_skb_queue);
 		wake_up(&audit_backlog_wait);
diff --git a/kernel/exit.c b/kernel/exit.c
index 57626692cd90..e8af8d0c2483 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
 #include <linux/mempolicy.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
+#include <linux/freezer.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
@@ -387,6 +388,11 @@ void daemonize(const char *name, ...)
 	 * they would be locked into memory.
 	 */
 	exit_mm(current);
+	/*
+	 * We don't want to have TIF_FREEZE set if the system-wide hibernation
+	 * or suspend transition begins right now.
+	 */
+	current->flags |= PF_NOFREEZE;
 
 	set_special_pids(1, 1);
 	proc_clear_tty(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7c5c5888e00a..ba39bdb2a7b8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -923,7 +923,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
+	new_flags &= ~PF_SUPERPRIV;
 	new_flags |= PF_FORKNOEXEC;
 	if (!(clone_flags & CLONE_PTRACE))
 		p->ptrace = 0;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 55ba82a85a66..ddff33247785 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -40,6 +40,7 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
+#include <linux/freezer.h>
 #include <linux/cpu.h>
 #include <linux/random.h>
 #include <linux/delay.h>
@@ -518,7 +519,6 @@ rcu_torture_writer(void *arg)
 
 	VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
 	set_user_nice(current, 19);
-	current->flags |= PF_NOFREEZE;
 
 	do {
 		schedule_timeout_uninterruptible(1);
@@ -558,7 +558,6 @@ rcu_torture_fakewriter(void *arg)
 
 	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
 	set_user_nice(current, 19);
-	current->flags |= PF_NOFREEZE;
 
 	do {
 		schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
@@ -589,7 +588,6 @@ rcu_torture_reader(void *arg)
 
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
 	set_user_nice(current, 19);
-	current->flags |= PF_NOFREEZE;
 
 	do {
 		idx = cur_ops->readlock();
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 015fc633c96c..e3055ba69159 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -260,6 +260,7 @@ static int test_func(void *data)
 	int ret;
 
 	current->flags |= PF_MUTEX_TESTER;
+	set_freezable();
 	allow_signal(SIGHUP);
 
 	for(;;) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 1c8076676eb1..cb31fb4a1379 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4912,8 +4912,6 @@ static int migration_thread(void *data)
 		struct migration_req *req;
 		struct list_head *head;
 
-		try_to_freeze();
-
 		spin_lock_irq(&rq->lock);
 
 		if (cpu_is_offline(cpu)) {
@@ -5147,7 +5145,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
 		if (IS_ERR(p))
 			return NOTIFY_BAD;
-		p->flags |= PF_NOFREEZE;
 		kthread_bind(p, cpu);
 		/* Must be high prio: stop_machine expects to yield to it. */
 		rq = task_rq_lock(p, &flags);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8de267790166..0f546ddea43d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -14,6 +14,7 @@
 #include <linux/notifier.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
 #include <linux/smp.h>
@@ -488,8 +489,6 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-	current->flags |= PF_NOFREEZE;
-
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	while (!kthread_should_stop()) {
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0131e296ffb4..708d4882c0c3 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -10,6 +10,7 @@
 #include <linux/cpu.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
@@ -116,7 +117,6 @@ static int watchdog(void * __bind_cpu)
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
-	current->flags |= PF_NOFREEZE;
 
 	/* initialize timestamp */
 	touch_softlockup_watchdog();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d7d3fa3072e5..1935302cc645 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -282,8 +282,8 @@ static int worker_thread(void *__cwq)
 	struct cpu_workqueue_struct *cwq = __cwq;
 	DEFINE_WAIT(wait);
 
-	if (!cwq->wq->freezeable)
-		current->flags |= PF_NOFREEZE;
+	if (cwq->wq->freezeable)
+		set_freezable();
 
 	set_user_nice(current, -5);
 
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 8ce0900dc95c..8f6ee073c0e3 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -92,6 +92,7 @@ struct pdflush_work {
 static int __pdflush(struct pdflush_work *my_work)
 {
 	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
 	my_work->fn = NULL;
 	my_work->who = current;
 	INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2225b7c9df85..d419e10e3daa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1421,6 +1421,7 @@ static int kswapd(void *p)
 	 * trying to free the first piece of memory in the first place).
 	 */
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+	set_freezable();
 
 	order = 0;
 	for ( ; ; ) {
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 1c8f4a0c5f43..1f78c3e336d8 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -36,6 +36,7 @@
 #include <linux/signal.h>
 #include <linux/init.h>
 #include <linux/wait.h>
+#include <linux/freezer.h>
 #include <linux/errno.h>
 #include <linux/net.h>
 #include <net/sock.h>
@@ -474,7 +475,6 @@ static int bnep_session(void *arg)
 
 	daemonize("kbnepd %s", dev->name);
 	set_user_nice(current, -15);
-	current->flags |= PF_NOFREEZE;
 
 	init_waitqueue_entry(&wait, current);
 	add_wait_queue(sk->sk_sleep, &wait);
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 66bef1ccee2a..ca60a4517fd3 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/fcntl.h>
+#include <linux/freezer.h>
 #include <linux/skbuff.h>
 #include <linux/socket.h>
 #include <linux/ioctl.h>
@@ -287,7 +288,6 @@ static int cmtp_session(void *arg)
 
 	daemonize("kcmtpd_ctr_%d", session->num);
 	set_user_nice(current, -15);
-	current->flags |= PF_NOFREEZE;
 
 	init_waitqueue_entry(&wait, current);
 	add_wait_queue(sk->sk_sleep, &wait);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 450eb0244bbf..64d89ca28847 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -28,6 +28,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
+#include <linux/freezer.h>
 #include <linux/fcntl.h>
 #include <linux/skbuff.h>
 #include <linux/socket.h>
@@ -547,7 +548,6 @@ static int hidp_session(void *arg)
 
 	daemonize("khidpd_%04x%04x", vendor, product);
 	set_user_nice(current, -15);
-	current->flags |= PF_NOFREEZE;
 
 	init_waitqueue_entry(&ctrl_wait, current);
 	init_waitqueue_entry(&intr_wait, current);
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 52e04df323ea..bb7220770f2c 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -33,6 +33,7 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/init.h>
+#include <linux/freezer.h>
 #include <linux/wait.h>
 #include <linux/device.h>
 #include <linux/net.h>
@@ -1940,7 +1941,6 @@ static int rfcomm_run(void *unused)
 
 	daemonize("krfcommd");
 	set_user_nice(current, -10);
-	current->flags |= PF_NOFREEZE;
 
 	BT_DBG("");
 
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 75215331b045..bca787fdbc51 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3465,6 +3465,8 @@ static int pktgen_thread_worker(void *arg)
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
+	set_freezable();
+
 	while (!kthread_should_stop()) {
 		pkt_dev = next_to_run(t);
 
-- 
cgit v1.3-8-gc7d7


From 2a7326b5bbafac4c96bcdb944b2a773593030b96 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 17 Jul 2007 04:03:37 -0700
Subject: CONFIG_BOUNCE to avoid useless inclusion of bounce buffer logic

The bounce buffer logic is included on systems that do not need it.  If a
system does not have zones like ZONE_DMA and ZONE_HIGHMEM that can lead to
the use of bounce buffers then there is no need to reserve memory pools etc
etc.  This is true f.e.  for SGI Altix.

Also nicifies the Makefile and gets rid of the tricky "and" there.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/blkdev.h | 2 +-
 mm/Kconfig             | 4 ++++
 mm/Makefile            | 4 +---
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b32564a1e105..f78965fc6426 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -624,7 +624,7 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn;
  */
 #define BLK_DEFAULT_SG_TIMEOUT	(60 * HZ)
 
-#ifdef CONFIG_MMU
+#ifdef CONFIG_BOUNCE
 extern int init_emergency_isa_pool(void);
 extern void blk_queue_bounce(request_queue_t *q, struct bio **bio);
 #else
diff --git a/mm/Kconfig b/mm/Kconfig
index 086af703da43..86187221e78f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -163,6 +163,10 @@ config ZONE_DMA_FLAG
 	default "0" if !ZONE_DMA
 	default "1"
 
+config BOUNCE
+	def_bool y
+	depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+
 config NR_QUICK
 	int
 	depends on QUICKLIST
diff --git a/mm/Makefile b/mm/Makefile
index a9148ea329aa..245e33ab00c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,9 +13,7 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   $(mmu-y)
 
-ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
-obj-y			+= bounce.o
-endif
+obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
-- 
cgit v1.3-8-gc7d7


From bcdcd8e725b923ad7c0de809680d5d5658a7bf8c Mon Sep 17 00:00:00 2001
From: Pavel Emelianov <xemul@openvz.org>
Date: Tue, 17 Jul 2007 04:03:42 -0700
Subject: Report that kernel is tainted if there was an OOPS

If the kernel OOPSed or BUGed then it probably should be considered as
tainted.  Thus, all subsequent OOPSes and SysRq dumps will report the
tainted kernel.  This saves a lot of time explaining oddities in the
calltraces.

Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Added parisc patch from Matthew Wilson  -Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/oops-tracing.txt | 2 ++
 arch/alpha/kernel/traps.c      | 1 +
 arch/arm/kernel/traps.c        | 1 +
 arch/arm26/kernel/traps.c      | 1 +
 arch/avr32/kernel/traps.c      | 1 +
 arch/i386/kernel/traps.c       | 1 +
 arch/ia64/kernel/traps.c       | 1 +
 arch/m68k/kernel/traps.c       | 1 +
 arch/m68knommu/kernel/traps.c  | 1 +
 arch/mips/kernel/traps.c       | 1 +
 arch/parisc/kernel/traps.c     | 1 +
 arch/powerpc/kernel/traps.c    | 1 +
 arch/ppc/kernel/traps.c        | 1 +
 arch/s390/kernel/traps.c       | 1 +
 arch/sh/kernel/traps.c         | 1 +
 arch/sparc/kernel/traps.c      | 1 +
 arch/sparc64/kernel/traps.c    | 1 +
 arch/x86_64/kernel/traps.c     | 1 +
 arch/xtensa/kernel/traps.c     | 1 +
 include/linux/kernel.h         | 1 +
 kernel/panic.c                 | 5 +++--
 21 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
index 23e6dde7eea6..7f60dfe642ca 100644
--- a/Documentation/oops-tracing.txt
+++ b/Documentation/oops-tracing.txt
@@ -251,6 +251,8 @@ characters, each representing a particular tainted value.
   7: 'U' if a user or user application specifically requested that the
      Tainted flag be set, ' ' otherwise.
 
+  8: 'D' if the kernel has died recently, i.e. there was an OOPS or BUG.
+
 The primary reason for the 'Tainted: ' string is to tell kernel
 debuggers if this is a clean kernel or if anything unusual has
 occurred.  Tainting is permanent: even if an offending module is
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index d6e665d567bd..ec0f05e0d8ff 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -184,6 +184,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15)
 #endif
 	printk("%s(%d): %s %ld\n", current->comm, current->pid, str, err);
 	dik_show_regs(regs, r9_15);
+	add_taint(TAINT_DIE);
 	dik_show_trace((unsigned long *)(regs+1));
 	dik_show_code((unsigned int *)regs->pc);
 
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 237f4999b9a1..f2114bcf09d5 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -249,6 +249,7 @@ NORET_TYPE void die(const char *str, struct pt_regs *regs, int err)
 	bust_spinlocks(1);
 	__die(str, err, thread, regs);
 	bust_spinlocks(0);
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die_lock);
 
 	if (in_interrupt())
diff --git a/arch/arm26/kernel/traps.c b/arch/arm26/kernel/traps.c
index d594fb59e945..2911e2eae80e 100644
--- a/arch/arm26/kernel/traps.c
+++ b/arch/arm26/kernel/traps.c
@@ -185,6 +185,7 @@ NORET_TYPE void die(const char *str, struct pt_regs *regs, int err)
 	printk("Internal error: %s: %x\n", str, err);
 	printk("CPU: %d\n", smp_processor_id());
 	show_regs(regs);
+	add_taint(TAINT_DIE);
 	printk("Process %s (pid: %d, stack limit = 0x%p)\n",
 		current->comm, current->pid, end_of_stack(tsk));
 
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index aaa792815cd7..9a73ce7eb50f 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -56,6 +56,7 @@ void NORET_TYPE die(const char *str, struct pt_regs *regs, long err)
 	show_regs_log_lvl(regs, KERN_EMERG);
 	show_stack_log_lvl(current, regs->sp, regs, KERN_EMERG);
 	bust_spinlocks(0);
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die_lock);
 
 	if (in_interrupt())
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 28bd1c5163ec..18c1c285836d 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -433,6 +433,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 
 	bust_spinlocks(0);
 	die.lock_owner = -1;
+	add_taint(TAINT_DIE);
 	spin_unlock_irqrestore(&die.lock, flags);
 
 	if (!regs)
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 15ad85da15a9..3aeaf15e468b 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -69,6 +69,7 @@ die (const char *str, struct pt_regs *regs, long err)
 
 	bust_spinlocks(0);
 	die.lock_owner = -1;
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die.lock);
 
 	if (panic_on_oops)
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index a27a4fa33296..4e2752a0e89b 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1170,6 +1170,7 @@ void die_if_kernel (char *str, struct pt_regs *fp, int nr)
 	console_verbose();
 	printk("%s: %08x\n",str,nr);
 	show_registers(fp);
+	add_taint(TAINT_DIE);
 	do_exit(SIGSEGV);
 }
 
diff --git a/arch/m68knommu/kernel/traps.c b/arch/m68knommu/kernel/traps.c
index bed5f47bf568..fde04e1757f7 100644
--- a/arch/m68knommu/kernel/traps.c
+++ b/arch/m68knommu/kernel/traps.c
@@ -83,6 +83,7 @@ void die_if_kernel(char *str, struct pt_regs *fp, int nr)
 	printk(KERN_EMERG "Process %s (pid: %d, stackpage=%08lx)\n",
 		current->comm, current->pid, PAGE_SIZE+(unsigned long)current);
 	show_stack(NULL, (unsigned long *)fp);
+	add_taint(TAINT_DIE);
 	do_exit(SIGSEGV);
 }
 
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 37c562c4c817..ce277cb34dd0 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -326,6 +326,7 @@ void __noreturn die(const char * str, struct pt_regs * regs)
 #endif /* CONFIG_MIPS_MT_SMTC */
 	printk("%s[#%d]:\n", str, ++die_counter);
 	show_registers(regs);
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die_lock);
 
 	if (in_interrupt())
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index f9bca2d74b38..bbf029a184ac 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -264,6 +264,7 @@ KERN_CRIT "                     ||     ||\n");
 
 	show_regs(regs);
 	dump_stack();
+	add_taint(TAINT_DIE);
 
 	if (in_interrupt())
 		panic("Fatal exception in interrupt");
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 3b8427e6283d..2bb1cb911783 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -149,6 +149,7 @@ int die(const char *str, struct pt_regs *regs, long err)
 
 	bust_spinlocks(0);
 	die.lock_owner = -1;
+	add_taint(TAINT_DIE);
 	spin_unlock_irqrestore(&die.lock, flags);
 
 	if (kexec_should_crash(current) ||
diff --git a/arch/ppc/kernel/traps.c b/arch/ppc/kernel/traps.c
index 0eaef7c8378b..3f3b292eb773 100644
--- a/arch/ppc/kernel/traps.c
+++ b/arch/ppc/kernel/traps.c
@@ -92,6 +92,7 @@ int die(const char * str, struct pt_regs * fp, long err)
 	if (nl)
 		printk("\n");
 	show_regs(fp);
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die_lock);
 	/* do_exit() should take care of panic'ing from an interrupt
 	 * context so we don't handle it here
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 81e03b9c3841..8ec9def83ccb 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -262,6 +262,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 	print_modules();
 	show_regs(regs);
 	bust_spinlocks(0);
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die_lock);
 	if (in_interrupt())
 		panic("Fatal exception in interrupt");
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 05a40f3c30bf..502d43e4785c 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -103,6 +103,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 			 (unsigned long)task_stack_page(current));
 
 	bust_spinlocks(0);
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die_lock);
 
 	if (kexec_should_crash(current))
diff --git a/arch/sparc/kernel/traps.c b/arch/sparc/kernel/traps.c
index dc9ffea2a4f7..3bc3bff51e08 100644
--- a/arch/sparc/kernel/traps.c
+++ b/arch/sparc/kernel/traps.c
@@ -101,6 +101,7 @@ void die_if_kernel(char *str, struct pt_regs *regs)
 
 	printk("%s(%d): %s [#%d]\n", current->comm, current->pid, str, ++die_counter);
 	show_regs(regs);
+	add_taint(TAINT_DIE);
 
 	__SAVE; __SAVE; __SAVE; __SAVE;
 	__SAVE; __SAVE; __SAVE; __SAVE;
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index 00a9e3286c83..6ef2d299fb10 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -2225,6 +2225,7 @@ void die_if_kernel(char *str, struct pt_regs *regs)
 	notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV);
 	__asm__ __volatile__("flushw");
 	__show_regs(regs);
+	add_taint(TAINT_DIE);
 	if (regs->tstate & TSTATE_PRIV) {
 		struct reg_window *rw = (struct reg_window *)
 			(regs->u_regs[UREG_FP] + STACK_BIAS);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 7fa155c394d9..6963b64a76ca 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -518,6 +518,7 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
 	printk("\n");
 	notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
 	show_registers(regs);
+	add_taint(TAINT_DIE);
 	/* Executive summary in case the oops scrolled away */
 	printk(KERN_ALERT "RIP ");
 	printk_address(regs->rip); 
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 693ab268485e..c5e62f9d9f50 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -482,6 +482,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 	if (!user_mode(regs))
 		show_stack(NULL, (unsigned long*)regs->areg[1]);
 
+	add_taint(TAINT_DIE);
 	spin_unlock_irq(&die_lock);
 
 	if (in_interrupt())
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7a4852505914..1eb9cde550c4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -210,6 +210,7 @@ extern enum system_states {
 #define TAINT_MACHINE_CHECK		(1<<4)
 #define TAINT_BAD_PAGE			(1<<5)
 #define TAINT_USER			(1<<6)
+#define TAINT_DIE			(1<<7)
 
 extern void dump_stack(void);
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 623d1828259a..f64f4c1ac11f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -159,14 +159,15 @@ const char *print_tainted(void)
 {
 	static char buf[20];
 	if (tainted) {
-		snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c",
+		snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c",
 			tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
 			tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
 			tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
 			tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
  			tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
 			tainted & TAINT_BAD_PAGE ? 'B' : ' ',
-			tainted & TAINT_USER ? 'U' : ' ');
+			tainted & TAINT_USER ? 'U' : ' ',
+			tainted & TAINT_DIE ? 'D' : ' ');
 	}
 	else
 		snprintf(buf, sizeof(buf), "Not tainted");
-- 
cgit v1.3-8-gc7d7


From 7664732315c97f48dba9d1e7339ad16fc5a320ac Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 17 Jul 2007 04:03:43 -0700
Subject: PTRACE_PEEKDATA consolidation

Identical implementations of PTRACE_PEEKDATA go into generic_ptrace_peekdata()
function.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/ptrace.c           |  8 +-------
 arch/arm26/kernel/ptrace.c         |  8 +-------
 arch/avr32/kernel/ptrace.c         |  7 +------
 arch/cris/arch-v10/kernel/ptrace.c | 14 ++------------
 arch/frv/kernel/ptrace.c           | 12 ++----------
 arch/i386/kernel/ptrace.c          | 12 ++----------
 arch/m32r/kernel/ptrace.c          |  7 +------
 arch/m68k/kernel/ptrace.c          |  5 +----
 arch/m68knommu/kernel/ptrace.c     | 12 ++----------
 arch/mips/kernel/ptrace.c          | 12 ++----------
 arch/parisc/kernel/ptrace.c        | 13 ++-----------
 arch/powerpc/kernel/ptrace.c       | 12 ++----------
 arch/s390/kernel/ptrace.c          |  6 +-----
 arch/sh/kernel/ptrace.c            | 13 ++-----------
 arch/sh64/kernel/ptrace.c          | 12 ++----------
 arch/um/kernel/ptrace.c            | 12 ++----------
 arch/v850/kernel/ptrace.c          |  8 ++------
 arch/x86_64/kernel/ptrace.c        | 12 ++----------
 arch/xtensa/kernel/ptrace.c        | 12 +-----------
 include/linux/ptrace.h             |  1 +
 kernel/ptrace.c                    | 11 +++++++++++
 21 files changed, 43 insertions(+), 166 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 6f2f46c2e406..9a5d9754c2a2 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -657,7 +657,6 @@ static int ptrace_setcrunchregs(struct task_struct *tsk, void __user *ufp)
 
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	unsigned long tmp;
 	int ret;
 
 	switch (request) {
@@ -666,12 +665,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		 */
 		case PTRACE_PEEKTEXT:
 		case PTRACE_PEEKDATA:
-			ret = access_process_vm(child, addr, &tmp,
-						sizeof(unsigned long), 0);
-			if (ret == sizeof(unsigned long))
-				ret = put_user(tmp, (unsigned long __user *) data);
-			else
-				ret = -EIO;
+			ret = generic_ptrace_peekdata(child, addr, data);
 			break;
 
 		case PTRACE_PEEKUSR:
diff --git a/arch/arm26/kernel/ptrace.c b/arch/arm26/kernel/ptrace.c
index 416927956721..0d0ead0e0a74 100644
--- a/arch/arm26/kernel/ptrace.c
+++ b/arch/arm26/kernel/ptrace.c
@@ -531,7 +531,6 @@ static int ptrace_setfpregs(struct task_struct *tsk, void *ufp)
 
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	unsigned long tmp;
 	int ret;
 
 	switch (request) {
@@ -540,12 +539,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		 */
 		case PTRACE_PEEKTEXT:
 		case PTRACE_PEEKDATA:
-			ret = access_process_vm(child, addr, &tmp,
-						sizeof(unsigned long), 0);
-			if (ret == sizeof(unsigned long))
-				ret = put_user(tmp, (unsigned long *) data);
-			else
-				ret = -EIO;
+			ret = generic_ptrace_peekdata(child, addr, data);
 			break;
 
 		case PTRACE_PEEKUSR:
diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c
index 3c36c2d16148..1043fdc2df7f 100644
--- a/arch/avr32/kernel/ptrace.c
+++ b/arch/avr32/kernel/ptrace.c
@@ -153,7 +153,6 @@ static int ptrace_setregs(struct task_struct *tsk, const void __user *uregs)
 
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	unsigned long tmp;
 	int ret;
 
 	pr_debug("arch_ptrace(%ld, %d, %#lx, %#lx)\n",
@@ -166,11 +165,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* Read the word at location addr in the child process */
 	case PTRACE_PEEKTEXT:
 	case PTRACE_PEEKDATA:
-		ret = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		if (ret == sizeof(tmp))
-			ret = put_user(tmp, (unsigned long __user *)data);
-		else
-			ret = -EIO;
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
 
 	case PTRACE_PEEKUSR:
diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c
index fd2129a04586..74b1b4dc8225 100644
--- a/arch/cris/arch-v10/kernel/ptrace.c
+++ b/arch/cris/arch-v10/kernel/ptrace.c
@@ -83,19 +83,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 		/* Read word at location address. */ 
 		case PTRACE_PEEKTEXT:
-		case PTRACE_PEEKDATA: {
-			unsigned long tmp;
-			int copied;
-
-			copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-			ret = -EIO;
-			
-			if (copied != sizeof(tmp))
-				break;
-			
-			ret = put_user(tmp,datap);
+		case PTRACE_PEEKDATA:
+			ret = generic_ptrace_peekdata(child, addr, data);
 			break;
-		}
 
 		/* Read the word at location address in the USER area. */
 		case PTRACE_PEEKUSR: {
diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index ce88fb95ee59..a10f3092fad4 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -112,20 +112,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 		/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
-	case PTRACE_PEEKDATA: {
-		int copied;
-
+	case PTRACE_PEEKDATA:
 		ret = -EIO;
 		if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
 			break;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		if (copied != sizeof(tmp))
-			break;
-
-		ret = put_user(tmp,(unsigned long *) data);
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
-	}
 
 		/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 0c0ceec5de00..f4bcf1da662a 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -358,17 +358,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp, datap);
+	case PTRACE_PEEKDATA:
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
-	}
 
 	/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index 5f02b3144875..01a1c9ac8458 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -595,7 +595,6 @@ void ptrace_disable(struct task_struct *child)
 static int
 do_ptrace(long request, struct task_struct *child, long addr, long data)
 {
-	unsigned long tmp;
 	int ret;
 
 	switch (request) {
@@ -604,11 +603,7 @@ do_ptrace(long request, struct task_struct *child, long addr, long data)
 	 */
 	case PTRACE_PEEKTEXT:
 	case PTRACE_PEEKDATA:
-		ret = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		if (ret == sizeof(tmp))
-			ret = put_user(tmp,(unsigned long __user *) data);
-		else
-			ret = -EIO;
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
 
 	/*
diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c
index cdba9fd6d82f..01a3a09c53d9 100644
--- a/arch/m68k/kernel/ptrace.c
+++ b/arch/m68k/kernel/ptrace.c
@@ -128,10 +128,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT:	/* read word at location addr. */
 	case PTRACE_PEEKDATA:
-		i = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		if (i != sizeof(tmp))
-			goto out_eio;
-		ret = put_user(tmp, (unsigned long *)data);
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
 
 	/* read the word at location addr in the USER area. */
diff --git a/arch/m68knommu/kernel/ptrace.c b/arch/m68knommu/kernel/ptrace.c
index f54b6a3dfecb..f550e614aa78 100644
--- a/arch/m68knommu/kernel/ptrace.c
+++ b/arch/m68knommu/kernel/ptrace.c
@@ -106,17 +106,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 		/* when I and D space are separate, these will need to be fixed. */
 		case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-		case PTRACE_PEEKDATA: {
-			unsigned long tmp;
-			int copied;
-
-			copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-			ret = -EIO;
-			if (copied != sizeof(tmp))
-				break;
-			ret = put_user(tmp,(unsigned long *) data);
+		case PTRACE_PEEKDATA:
+			ret = generic_ptrace_peekdata(child, addr, data);
 			break;
-		}
 
 		/* read the word at location addr in the USER area. */
 		case PTRACE_PEEKUSR: {
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index b5a7b46bbc49..af9d0bec8731 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -174,17 +174,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp,(unsigned long __user *) data);
+	case PTRACE_PEEKDATA:
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
-	}
 
 	/* Read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 8a0db376e91e..26ec774c5027 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -87,10 +87,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
 	case PTRACE_PEEKDATA: {
-		int copied;
-
 #ifdef CONFIG_64BIT
 		if (__is_compat_task(child)) {
+			int copied;
 			unsigned int tmp;
 
 			addr &= 0xffffffffL;
@@ -105,15 +104,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		}
 		else
 #endif
-		{
-			unsigned long tmp;
-
-			copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-			ret = -EIO;
-			if (copied != sizeof(tmp))
-				goto out_tsk;
-			ret = put_user(tmp,(unsigned long *) data);
-		}
+			ret = generic_ptrace_peekdata(child, addr, data);
 		goto out_tsk;
 	}
 
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 0fb53950da43..581d427148e7 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -379,17 +379,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp,(unsigned long __user *) data);
+	case PTRACE_PEEKDATA:
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
-	}
 
 	/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 2a8f0872ea8b..28afff4e5d1b 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -294,7 +294,6 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
 static int
 do_ptrace_normal(struct task_struct *child, long request, long addr, long data)
 {
-	unsigned long tmp;
 	ptrace_area parea; 
 	int copied, ret;
 
@@ -304,10 +303,7 @@ do_ptrace_normal(struct task_struct *child, long request, long addr, long data)
 		/* Remove high order bit from address (only for 31 bit). */
 		addr &= PSW_ADDR_INSN;
 		/* read word at location addr. */
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		if (copied != sizeof(tmp))
-			return -EIO;
-		return put_user(tmp, (unsigned long __force __user *) data);
+		return generic_ptrace_peekdata(child, addr, data);
 
 	case PTRACE_PEEKUSR:
 		/* read the word at location addr in the USER area. */
diff --git a/arch/sh/kernel/ptrace.c b/arch/sh/kernel/ptrace.c
index f2eaa485d04d..f23f949576a5 100644
--- a/arch/sh/kernel/ptrace.c
+++ b/arch/sh/kernel/ptrace.c
@@ -91,17 +91,8 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp,(unsigned long __user *) data);
-		break;
-	}
+	case PTRACE_PEEKDATA:
+		ret = generic_ptrace_peekdata(child, addr, data);
 
 	/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
diff --git a/arch/sh64/kernel/ptrace.c b/arch/sh64/kernel/ptrace.c
index 4e95e18b46d9..12340e499bfb 100644
--- a/arch/sh64/kernel/ptrace.c
+++ b/arch/sh64/kernel/ptrace.c
@@ -129,17 +129,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp,(unsigned long *) data);
+	case PTRACE_PEEKDATA:
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
-	}
 
 	/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 627742d89434..1966da6eb363 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -52,17 +52,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 		/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		ret = -EIO;
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp, p);
+	case PTRACE_PEEKDATA:
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
-	}
 
 	/* read the word at location addr in the USER area. */
         case PTRACE_PEEKUSR:
diff --git a/arch/v850/kernel/ptrace.c b/arch/v850/kernel/ptrace.c
index a9b09343097d..3bedd144e52d 100644
--- a/arch/v850/kernel/ptrace.c
+++ b/arch/v850/kernel/ptrace.c
@@ -117,15 +117,11 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	int rval;
 
 	switch (request) {
-		unsigned long val, copied;
+		unsigned long val;
 
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
 	case PTRACE_PEEKDATA:
-		copied = access_process_vm(child, addr, &val, sizeof(val), 0);
-		rval = -EIO;
-		if (copied != sizeof(val))
-			break;
-		rval = put_user(val, (unsigned long *)data);
+		rval = generic_ptrace_peekdata(child, addr, data);
 		goto out;
 
 	case PTRACE_POKETEXT: /* write the word at location addr. */
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 9409117b9f19..327ff93a38b6 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -313,17 +313,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp,(unsigned long __user *) data);
+	case PTRACE_PEEKDATA:
+		ret = generic_ptrace_peekdata(child, addr, data);
 		break;
-	}
 
 	/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index 14104ff63093..af182d3a7000 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -50,18 +50,8 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	switch (request) {
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
 	case PTRACE_PEEKDATA:
-	{
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp,(unsigned long *) data);
-
+		ret = generic_ptrace_peekdata(child, addr, data);
 		goto out;
-	}
 
 	/* Read the word at location addr in the USER area.  */
 
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index eeb1976ef7bf..477cc8ed6bcb 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -110,6 +110,7 @@ static inline void ptrace_unlink(struct task_struct *child)
 		__ptrace_unlink(child);
 }
 
+int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data);
 
 #ifndef force_successful_syscall_return
 /*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b1d11f1c7cf7..1653d35419a1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -490,3 +490,14 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	return ret;
 }
 #endif /* __ARCH_SYS_PTRACE */
+
+int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
+{
+	unsigned long tmp;
+	int copied;
+
+	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+	if (copied != sizeof(tmp))
+		return -EIO;
+	return put_user(tmp, (unsigned long __user *)data);
+}
-- 
cgit v1.3-8-gc7d7


From f284ce7269031947326bac6bb19a977705276222 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 17 Jul 2007 04:03:44 -0700
Subject: PTRACE_POKEDATA consolidation

Identical implementations of PTRACE_POKEDATA go into generic_ptrace_pokedata()
function.

AFAICS, fix bug on xtensa where successful PTRACE_POKEDATA will nevertheless
return EPERM.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/ptrace.c         |  4 +---
 arch/arm/kernel/ptrace.c           |  7 +------
 arch/arm26/kernel/ptrace.c         |  7 +------
 arch/avr32/kernel/ptrace.c         |  6 +-----
 arch/cris/arch-v10/kernel/ptrace.c |  7 +------
 arch/cris/arch-v32/kernel/ptrace.c |  7 +------
 arch/frv/kernel/ptrace.c           |  4 +---
 arch/h8300/kernel/ptrace.c         |  5 +----
 arch/i386/kernel/ptrace.c          |  5 +----
 arch/m32r/kernel/ptrace.c          | 12 +++---------
 arch/m68k/kernel/ptrace.c          |  3 +--
 arch/m68knommu/kernel/ptrace.c     |  5 +----
 arch/mips/kernel/ptrace.c          |  6 +-----
 arch/powerpc/kernel/ptrace.c       |  6 +-----
 arch/s390/kernel/ptrace.c          |  5 +----
 arch/sh/kernel/ptrace.c            |  5 +----
 arch/sh64/kernel/ptrace.c          |  5 +----
 arch/um/kernel/ptrace.c            |  6 +-----
 arch/v850/kernel/ptrace.c          |  6 +-----
 arch/x86_64/kernel/ptrace.c        |  5 +----
 arch/xtensa/kernel/ptrace.c        |  5 +----
 include/linux/ptrace.h             |  1 +
 kernel/ptrace.c                    |  8 ++++++++
 23 files changed, 32 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index 0cd060598f9a..83a781842266 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -315,9 +315,7 @@ do_sys_ptrace(long request, long pid, long addr, long data,
 	/* When I and D space are separate, this will have to be fixed.  */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		tmp = data;
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 1);
-		ret = (copied == sizeof(tmp)) ? 0 : -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR: /* write the specified register */
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 9a5d9754c2a2..78c9f1a3d41f 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -677,12 +677,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		 */
 		case PTRACE_POKETEXT:
 		case PTRACE_POKEDATA:
-			ret = access_process_vm(child, addr, &data,
-						sizeof(unsigned long), 1);
-			if (ret == sizeof(unsigned long))
-				ret = 0;
-			else
-				ret = -EIO;
+			ret = generic_ptrace_pokedata(child, addr, data);
 			break;
 
 		case PTRACE_POKEUSR:
diff --git a/arch/arm26/kernel/ptrace.c b/arch/arm26/kernel/ptrace.c
index 0d0ead0e0a74..0fefb86970c6 100644
--- a/arch/arm26/kernel/ptrace.c
+++ b/arch/arm26/kernel/ptrace.c
@@ -551,12 +551,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		 */
 		case PTRACE_POKETEXT:
 		case PTRACE_POKEDATA:
-			ret = access_process_vm(child, addr, &data,
-						sizeof(unsigned long), 1);
-			if (ret == sizeof(unsigned long))
-				ret = 0;
-			else
-				ret = -EIO;
+			ret = generic_ptrace_pokedata(child, addr, data);
 			break;
 
 		case PTRACE_POKEUSR:
diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c
index 1043fdc2df7f..39060cbeb2a3 100644
--- a/arch/avr32/kernel/ptrace.c
+++ b/arch/avr32/kernel/ptrace.c
@@ -176,11 +176,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* Write the word in data at location addr */
 	case PTRACE_POKETEXT:
 	case PTRACE_POKEDATA:
-		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
-		if (ret == sizeof(data))
-			ret = 0;
-		else
-			ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR:
diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c
index 74b1b4dc8225..f4f9db698b44 100644
--- a/arch/cris/arch-v10/kernel/ptrace.c
+++ b/arch/cris/arch-v10/kernel/ptrace.c
@@ -103,12 +103,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		/* Write the word at location address. */
 		case PTRACE_POKETEXT:
 		case PTRACE_POKEDATA:
-			ret = 0;
-			
-			if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-				break;
-			
-			ret = -EIO;
+			ret = generic_ptrace_pokedata(child, addr, data);
 			break;
  
  		/* Write the word at location address in the USER area. */
diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c
index d4d57b741334..38ece0cd47cb 100644
--- a/arch/cris/arch-v32/kernel/ptrace.c
+++ b/arch/cris/arch-v32/kernel/ptrace.c
@@ -146,12 +146,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		/* Write the word at location address. */
 		case PTRACE_POKETEXT:
 		case PTRACE_POKEDATA:
-			ret = 0;
-
-			if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-				break;
-
-			ret = -EIO;
+			ret = generic_ptrace_pokedata(child, addr, data);
 			break;
 
  		/* Write the word at location address in the USER area. */
diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index a10f3092fad4..709e9bdc6126 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -168,9 +168,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		ret = -EIO;
 		if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
 			break;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) != sizeof(data))
-			break;
-		ret = 0;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c
index 8a7a991b8f76..d32bbf02fc48 100644
--- a/arch/h8300/kernel/ptrace.c
+++ b/arch/h8300/kernel/ptrace.c
@@ -111,10 +111,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
       /* when I and D space are separate, this will have to be fixed. */
 		case PTRACE_POKETEXT: /* write the word at location addr. */
 		case PTRACE_POKEDATA:
-			ret = 0;
-			if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-				break;
-			ret = -EIO;
+			ret = generic_ptrace_pokedata(child, addr, data);
 			break;
 
 		case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index f4bcf1da662a..1c075f58d1f9 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -387,10 +387,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		ret = 0;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-			break;
-		ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index 01a1c9ac8458..57a92ef31a90 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -619,15 +619,9 @@ do_ptrace(long request, struct task_struct *child, long addr, long data)
 	 */
 	case PTRACE_POKETEXT:
 	case PTRACE_POKEDATA:
-		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
-		if (ret == sizeof(data)) {
-			ret = 0;
-			if (request == PTRACE_POKETEXT) {
-				invalidate_cache();
-			}
-		} else {
-			ret = -EIO;
-		}
+		ret = generic_ptrace_pokedata(child, addr, data);
+		if (ret == 0 && request == PTRACE_POKETEXT)
+			invalidate_cache();
 		break;
 
 	/*
diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c
index 01a3a09c53d9..2cf0690b7882 100644
--- a/arch/m68k/kernel/ptrace.c
+++ b/arch/m68k/kernel/ptrace.c
@@ -157,8 +157,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT:	/* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) != sizeof(data))
-			goto out_eio;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR:	/* write the word at location addr in the USER area */
diff --git a/arch/m68knommu/kernel/ptrace.c b/arch/m68knommu/kernel/ptrace.c
index f550e614aa78..ef70ca070ce2 100644
--- a/arch/m68knommu/kernel/ptrace.c
+++ b/arch/m68knommu/kernel/ptrace.c
@@ -151,10 +151,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		/* when I and D space are separate, this will have to be fixed. */
 		case PTRACE_POKETEXT: /* write the word at location addr. */
 		case PTRACE_POKEDATA:
-			ret = 0;
-			if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-				break;
-			ret = -EIO;
+			ret = generic_ptrace_pokedata(child, addr, data);
 			break;
 
 		case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index af9d0bec8731..893e7bccf226 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -305,11 +305,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		ret = 0;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1)
-		    == sizeof(data))
-			break;
-		ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR: {
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 581d427148e7..8a177bd9eab4 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -413,11 +413,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* If I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		ret = 0;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1)
-				== sizeof(data))
-			break;
-		ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	/* write the word at location addr in the USER area */
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 28afff4e5d1b..f4503ca27630 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -314,10 +314,7 @@ do_ptrace_normal(struct task_struct *child, long request, long addr, long data)
 		/* Remove high order bit from address (only for 31 bit). */
 		addr &= PSW_ADDR_INSN;
 		/* write the word at location addr. */
-		copied = access_process_vm(child, addr, &data, sizeof(data),1);
-		if (copied != sizeof(data))
-			return -EIO;
-		return 0;
+		return generic_ptrace_pokedata(child, addr, data);
 
 	case PTRACE_POKEUSR:
 		/* write the word at location addr in the USER area */
diff --git a/arch/sh/kernel/ptrace.c b/arch/sh/kernel/ptrace.c
index f23f949576a5..891d1d46c902 100644
--- a/arch/sh/kernel/ptrace.c
+++ b/arch/sh/kernel/ptrace.c
@@ -126,10 +126,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		ret = 0;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-			break;
-		ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/sh64/kernel/ptrace.c b/arch/sh64/kernel/ptrace.c
index 12340e499bfb..df06c6477468 100644
--- a/arch/sh64/kernel/ptrace.c
+++ b/arch/sh64/kernel/ptrace.c
@@ -158,10 +158,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		ret = 0;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-			break;
-		ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR:
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 1966da6eb363..6916c8888dba 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -64,11 +64,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		ret = -EIO;
-		if (access_process_vm(child, addr, &data, sizeof(data), 
-				      1) != sizeof(data))
-			break;
-		ret = 0;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/v850/kernel/ptrace.c b/arch/v850/kernel/ptrace.c
index 3bedd144e52d..a458ac941b25 100644
--- a/arch/v850/kernel/ptrace.c
+++ b/arch/v850/kernel/ptrace.c
@@ -126,11 +126,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		rval = 0;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1)
-		    == sizeof(data))
-			break;
-		rval = -EIO;
+		rval = generic_ptrace_pokedata(child, addr, data);
 		goto out;
 
 	/* Read/write the word at location ADDR in the registers.  */
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 327ff93a38b6..fa6775ef729f 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -359,10 +359,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/* when I and D space are separate, this will have to be fixed. */
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		ret = 0;
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
-			break;
-		ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		break;
 
 	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index af182d3a7000..06a13d9b69db 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -128,10 +128,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 
 	case PTRACE_POKETEXT: /* write the word at location addr. */
 	case PTRACE_POKEDATA:
-		if (access_process_vm(child, addr, &data, sizeof(data), 1)
-		    == sizeof(data))
-			break;
-		ret = -EIO;
+		ret = generic_ptrace_pokedata(child, addr, data);
 		goto out;
 
 	case PTRACE_POKEUSR:
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 477cc8ed6bcb..ae8146abd746 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -111,6 +111,7 @@ static inline void ptrace_unlink(struct task_struct *child)
 }
 
 int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data);
+int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data);
 
 #ifndef force_successful_syscall_return
 /*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1653d35419a1..4a1745f1dadf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -501,3 +501,11 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
 		return -EIO;
 	return put_user(tmp, (unsigned long __user *)data);
 }
+
+int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
+{
+	int copied;
+
+	copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+	return (copied == sizeof(data)) ? 0 : -EIO;
+}
-- 
cgit v1.3-8-gc7d7


From 62239ac2b301abc397e70986649666cfb7835907 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 17 Jul 2007 04:03:45 -0700
Subject: proper prototype for proc_nr_files()

Add a proper prototype for proc_nr_files() in include/linux/fs.h

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 5 +++++
 kernel/sysctl.c    | 4 +---
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e68780810279..b3a9f0db9d80 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -283,6 +283,7 @@ extern int dir_notify_enable;
 #include <linux/init.h>
 #include <linux/pid.h>
 #include <linux/mutex.h>
+#include <linux/sysctl.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -2050,5 +2051,9 @@ static inline void free_secdata(void *secdata)
 { }
 #endif	/* CONFIG_SECURITY */
 
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+		  void __user *buffer, size_t *lenp, loff_t *ppos);
+
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FS_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 48dae075d5c2..7063ebc6db05 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -29,6 +29,7 @@
 #include <linux/utsname.h>
 #include <linux/capability.h>
 #include <linux/smp_lock.h>
+#include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
@@ -49,9 +50,6 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 
-extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
-                     void __user *buffer, size_t *lenp, loff_t *ppos);
-
 #ifdef CONFIG_X86
 #include <asm/nmi.h>
 #include <asm/stacktrace.h>
-- 
cgit v1.3-8-gc7d7


From f4480240f700587c15507b7815e75989b16825b2 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 17 Jul 2007 04:03:47 -0700
Subject: unregister_blkdev(): return void

Put WARN_ON and fixed all callers of unregister_blkdev().  Now we can make
unregister_blkdev return void.

Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/genhd.c      | 7 +------
 include/linux/fs.h | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 7f71b2c472b8..3af1e7a378d4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -108,13 +108,11 @@ out:
 
 EXPORT_SYMBOL(register_blkdev);
 
-/* todo: make void - error printk here */
-int unregister_blkdev(unsigned int major, const char *name)
+void unregister_blkdev(unsigned int major, const char *name)
 {
 	struct blk_major_name **n;
 	struct blk_major_name *p = NULL;
 	int index = major_to_index(major);
-	int ret = 0;
 
 	mutex_lock(&block_subsys_lock);
 	for (n = &major_names[index]; *n; n = &(*n)->next)
@@ -122,15 +120,12 @@ int unregister_blkdev(unsigned int major, const char *name)
 			break;
 	if (!*n || strcmp((*n)->name, name)) {
 		WARN_ON(1);
-		ret = -EINVAL;
 	} else {
 		p = *n;
 		*n = p->next;
 	}
 	mutex_unlock(&block_subsys_lock);
 	kfree(p);
-
-	return ret;
 }
 
 EXPORT_SYMBOL(unregister_blkdev);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3a9f0db9d80..aa74f7de9dcd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1527,7 +1527,7 @@ extern void putname(const char *name);
 
 #ifdef CONFIG_BLOCK
 extern int register_blkdev(unsigned int, const char *);
-extern int unregister_blkdev(unsigned int, const char *);
+extern void unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
-- 
cgit v1.3-8-gc7d7


From 77293034696e3e0b6c8b8fc1f96be091104b3d2b Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 17 Jul 2007 04:03:49 -0700
Subject: Remove OPEN_MAX

The OPEN_MAX macro in limits.h should not be there.  It claims to be the
limit on file descriptors in a process, but its value is wrong for that.
There is no constant value, but a variable resource limit (RLIMIT_NOFILE).
Nothing in the kernel uses OPEN_MAX except things that are wrong to do so.
I've submitted other patches to remove those uses.

The proper thing to do according to POSIX is not to define OPEN_MAX at all.
The sysconf (_SC_OPEN_MAX) implementation works by calling getrlimit.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/limits.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/limits.h b/include/linux/limits.h
index eaf2e099f125..c4b4e579c01d 100644
--- a/include/linux/limits.h
+++ b/include/linux/limits.h
@@ -6,7 +6,6 @@
 #define NGROUPS_MAX    65536	/* supplemental group IDs are available */
 #define ARG_MAX       131072	/* # bytes of args + environ for exec() */
 #define CHILD_MAX        999    /* no limit :-) */
-#define OPEN_MAX         256	/* # open files a process may have */
 #define LINK_MAX         127	/* # links a file may have */
 #define MAX_CANON        255	/* size of the canonical input queue */
 #define MAX_INPUT        255	/* size of the type-ahead buffer */
-- 
cgit v1.3-8-gc7d7


From f9e86f419073605b4520848021cc042963c227c7 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 17 Jul 2007 04:03:49 -0700
Subject: Remove CHILD_MAX

The CHILD_MAX macro in limits.h should not be there.  It claims to be the
limit on processes a user can own, but its value is wrong for that.
There is no constant value, but a variable resource limit (RLIMIT_NPROC).
Nothing in the kernel uses CHILD_MAX.

The proper thing to do according to POSIX is not to define CHILD_MAX at all.
The sysconf (_SC_CHILD_MAX) implementation works by calling getrlimit.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/limits.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/limits.h b/include/linux/limits.h
index c4b4e579c01d..2d0f94162fb3 100644
--- a/include/linux/limits.h
+++ b/include/linux/limits.h
@@ -5,7 +5,6 @@
 
 #define NGROUPS_MAX    65536	/* supplemental group IDs are available */
 #define ARG_MAX       131072	/* # bytes of args + environ for exec() */
-#define CHILD_MAX        999    /* no limit :-) */
 #define LINK_MAX         127	/* # links a file may have */
 #define MAX_CANON        255	/* size of the canonical input queue */
 #define MAX_INPUT        255	/* size of the type-ahead buffer */
-- 
cgit v1.3-8-gc7d7


From b45d52797432bd6b5d9786dbda940eb8d0b9ed06 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Tue, 17 Jul 2007 04:03:50 -0700
Subject: sb1250-duart.c: SB1250 DUART serial support

This is a driver for the SB1250 DUART, a dual serial port implementation
included in the Broadcom family of SOCs descending from the SiByte SB1250
MIPS64 chip multiprocessor.  It is a new implementation replacing the
old-fashioned driver currently present in the linux-mips.org tree.  It
supports all the usual features one would expect from a(n asynchronous)
serial driver, including modem line control (as far as hardware supports it
-- there is edge detection logic missing from the DCD and RI lines and the
driver does not implement polling of these lines at the moment), the serial
console, BREAK transmission and reception, including the magic SysRq.  The
receive FIFO threshold is not maintained though.

The driver was tested with a SWARM board which uses a BCM1250 SOC (which is
dual MIPS64 CMP) and has both ports of the single DUART implemented wired
externally.  Both were tested.  Testing included using the ports as
terminal lines at 1200bps (which is the ports minimum), 115200bps and a
couple of random speeds inbetween.  The modem lines were verified to
operate correctly.  No testing was performed with a use as a network
interface, like with SLIP or PPP.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/sibyte/bcm1480/setup.c       |   1 +
 arch/mips/sibyte/sb1250/setup.c        |   1 +
 drivers/serial/Kconfig                 |  28 +
 drivers/serial/Makefile                |   1 +
 drivers/serial/sb1250-duart.c          | 972 +++++++++++++++++++++++++++++++++
 include/asm-mips/sibyte/bcm1480_regs.h |  30 +-
 include/asm-mips/sibyte/sb1250_regs.h  |  76 +--
 include/asm-mips/sibyte/sb1250_uart.h  |   7 +-
 include/linux/serial_core.h            |   3 +
 9 files changed, 1074 insertions(+), 45 deletions(-)
 create mode 100644 drivers/serial/sb1250-duart.c

(limited to 'include/linux')

diff --git a/arch/mips/sibyte/bcm1480/setup.c b/arch/mips/sibyte/bcm1480/setup.c
index bdaac34ae708..89f29233cae1 100644
--- a/arch/mips/sibyte/bcm1480/setup.c
+++ b/arch/mips/sibyte/bcm1480/setup.c
@@ -31,6 +31,7 @@
 unsigned int sb1_pass;
 unsigned int soc_pass;
 unsigned int soc_type;
+EXPORT_SYMBOL(soc_type);
 unsigned int periph_rev;
 unsigned int zbbus_mhz;
 
diff --git a/arch/mips/sibyte/sb1250/setup.c b/arch/mips/sibyte/sb1250/setup.c
index f4a6169aa0a4..2d5c6d8b41f2 100644
--- a/arch/mips/sibyte/sb1250/setup.c
+++ b/arch/mips/sibyte/sb1250/setup.c
@@ -31,6 +31,7 @@
 unsigned int sb1_pass;
 unsigned int soc_pass;
 unsigned int soc_type;
+EXPORT_SYMBOL(soc_type);
 unsigned int periph_rev;
 unsigned int zbbus_mhz;
 EXPORT_SYMBOL(zbbus_mhz);
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index cab42cbd920d..7fa413ddccf5 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -338,6 +338,34 @@ config SERIAL_AMBA_PL011_CONSOLE
 	  your boot loader (lilo or loadlin) about how to pass options to the
 	  kernel at boot time.)
 
+config SERIAL_SB1250_DUART
+	tristate "BCM1xxx on-chip DUART serial support"
+	depends on SIBYTE_SB1xxx_SOC=y
+	select SERIAL_CORE
+	default y
+	---help---
+	  Support for the asynchronous serial interface (DUART) included in
+	  the BCM1250 and derived System-On-a-Chip (SOC) devices.  Note that
+	  the letter D in DUART stands for "dual", which is how the device
+	  is implemented.  Depending on the SOC configuration there may be
+	  one or more DUARTs available of which all are handled.
+
+	  If unsure, say Y.  To compile this driver as a module, choose M here:
+	  the module will be called sb1250-duart.
+
+config SERIAL_SB1250_DUART_CONSOLE
+	bool "Support for console on a BCM1xxx DUART serial port"
+	depends on SERIAL_SB1250_DUART=y
+	select SERIAL_CORE_CONSOLE
+	default y
+	---help---
+	  If you say Y here, it will be possible to use a serial port as the
+	  system console (the system console is the device which receives all
+	  kernel messages and warnings and which allows logins in single user
+	  mode).
+
+	  If unsure, say Y.
+
 config SERIAL_ATMEL
 	bool "AT91 / AT32 on-chip serial port support"
 	depends on (ARM && ARCH_AT91) || AVR32
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 08ad0d978183..c48cdd61b736 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o
 obj-$(CONFIG_SERIAL_ICOM) += icom.o
 obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o
 obj-$(CONFIG_SERIAL_MPSC) += mpsc.o
+obj-$(CONFIG_SERIAL_SB1250_DUART) += sb1250-duart.o
 obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o
 obj-$(CONFIG_SERIAL_JSM) += jsm/
 obj-$(CONFIG_SERIAL_TXX9) += serial_txx9.o
diff --git a/drivers/serial/sb1250-duart.c b/drivers/serial/sb1250-duart.c
new file mode 100644
index 000000000000..1d9d7285172a
--- /dev/null
+++ b/drivers/serial/sb1250-duart.c
@@ -0,0 +1,972 @@
+/*
+ *	drivers/serial/sb1250-duart.c
+ *
+ *	Support for the asynchronous serial interface (DUART) included
+ *	in the BCM1250 and derived System-On-a-Chip (SOC) devices.
+ *
+ *	Copyright (c) 2007  Maciej W. Rozycki
+ *
+ *	Derived from drivers/char/sb1250_duart.c for which the following
+ *	copyright applies:
+ *
+ *	Copyright (c) 2000, 2001, 2002, 2003, 2004  Broadcom Corporation
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	References:
+ *
+ *	"BCM1250/BCM1125/BCM1125H User Manual", Broadcom Corporation
+ */
+
+#if defined(CONFIG_SERIAL_SB1250_DUART_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/serial.h>
+#include <linux/serial_core.h>
+#include <linux/spinlock.h>
+#include <linux/sysrq.h>
+#include <linux/tty.h>
+#include <linux/types.h>
+
+#include <asm/atomic.h>
+#include <asm/io.h>
+#include <asm/war.h>
+
+#include <asm/sibyte/sb1250.h>
+#include <asm/sibyte/sb1250_uart.h>
+#include <asm/sibyte/swarm.h>
+
+
+#if defined(CONFIG_SIBYTE_BCM1x55) || defined(CONFIG_SIBYTE_BCM1x80)
+#include <asm/sibyte/bcm1480_regs.h>
+#include <asm/sibyte/bcm1480_int.h>
+
+#define SBD_CHANREGS(line)	A_BCM1480_DUART_CHANREG((line), 0)
+#define SBD_CTRLREGS(line)	A_BCM1480_DUART_CTRLREG((line), 0)
+#define SBD_INT(line)		(K_BCM1480_INT_UART_0 + (line))
+
+#elif defined(CONFIG_SIBYTE_SB1250) || defined(CONFIG_SIBYTE_BCM112X)
+#include <asm/sibyte/sb1250_regs.h>
+#include <asm/sibyte/sb1250_int.h>
+
+#define SBD_CHANREGS(line)	A_DUART_CHANREG((line), 0)
+#define SBD_CTRLREGS(line)	A_DUART_CTRLREG(0)
+#define SBD_INT(line)		(K_INT_UART_0 + (line))
+
+#else
+#error invalid SB1250 UART configuration
+
+#endif
+
+
+MODULE_AUTHOR("Maciej W. Rozycki <macro@linux-mips.org>");
+MODULE_DESCRIPTION("BCM1xxx on-chip DUART serial driver");
+MODULE_LICENSE("GPL");
+
+
+#define DUART_MAX_CHIP 2
+#define DUART_MAX_SIDE 2
+
+/*
+ * Per-port state.
+ */
+struct sbd_port {
+	struct sbd_duart	*duart;
+	struct uart_port	port;
+	unsigned char __iomem	*memctrl;
+	int			tx_stopped;
+	int			initialised;
+};
+
+/*
+ * Per-DUART state for the shared register space.
+ */
+struct sbd_duart {
+	struct sbd_port		sport[2];
+	unsigned long		mapctrl;
+	atomic_t		map_guard;
+};
+
+#define to_sport(uport) container_of(uport, struct sbd_port, port)
+
+static struct sbd_duart sbd_duarts[DUART_MAX_CHIP];
+
+#define __unused __attribute__((__unused__))
+
+
+/*
+ * Reading and writing SB1250 DUART registers.
+ *
+ * There are three register spaces: two per-channel ones and
+ * a shared one.  We have to define accessors appropriately.
+ * All registers are 64-bit and all but the Baud Rate Clock
+ * registers only define 8 least significant bits.  There is
+ * also a workaround to take into account.  Raw accessors use
+ * the full register width, but cooked ones truncate it
+ * intentionally so that the rest of the driver does not care.
+ */
+static u64 __read_sbdchn(struct sbd_port *sport, int reg)
+{
+	void __iomem *csr = sport->port.membase + reg;
+
+	return __raw_readq(csr);
+}
+
+static u64 __read_sbdshr(struct sbd_port *sport, int reg)
+{
+	void __iomem *csr = sport->memctrl + reg;
+
+	return __raw_readq(csr);
+}
+
+static void __write_sbdchn(struct sbd_port *sport, int reg, u64 value)
+{
+	void __iomem *csr = sport->port.membase + reg;
+
+	__raw_writeq(value, csr);
+}
+
+static void __write_sbdshr(struct sbd_port *sport, int reg, u64 value)
+{
+	void __iomem *csr = sport->memctrl + reg;
+
+	__raw_writeq(value, csr);
+}
+
+/*
+ * In bug 1956, we get glitches that can mess up uart registers.  This
+ * "read-mode-reg after any register access" is an accepted workaround.
+ */
+static void __war_sbd1956(struct sbd_port *sport)
+{
+	__read_sbdchn(sport, R_DUART_MODE_REG_1);
+	__read_sbdchn(sport, R_DUART_MODE_REG_2);
+}
+
+static unsigned char read_sbdchn(struct sbd_port *sport, int reg)
+{
+	unsigned char retval;
+
+	retval = __read_sbdchn(sport, reg);
+	if (SIBYTE_1956_WAR)
+		__war_sbd1956(sport);
+	return retval;
+}
+
+static unsigned char read_sbdshr(struct sbd_port *sport, int reg)
+{
+	unsigned char retval;
+
+	retval = __read_sbdshr(sport, reg);
+	if (SIBYTE_1956_WAR)
+		__war_sbd1956(sport);
+	return retval;
+}
+
+static void write_sbdchn(struct sbd_port *sport, int reg, unsigned int value)
+{
+	__write_sbdchn(sport, reg, value);
+	if (SIBYTE_1956_WAR)
+		__war_sbd1956(sport);
+}
+
+static void write_sbdshr(struct sbd_port *sport, int reg, unsigned int value)
+{
+	__write_sbdshr(sport, reg, value);
+	if (SIBYTE_1956_WAR)
+		__war_sbd1956(sport);
+}
+
+
+static int sbd_receive_ready(struct sbd_port *sport)
+{
+	return read_sbdchn(sport, R_DUART_STATUS) & M_DUART_RX_RDY;
+}
+
+static int sbd_receive_drain(struct sbd_port *sport)
+{
+	int loops = 10000;
+
+	while (sbd_receive_ready(sport) && loops--)
+		read_sbdchn(sport, R_DUART_RX_HOLD);
+	return loops;
+}
+
+static int __unused sbd_transmit_ready(struct sbd_port *sport)
+{
+	return read_sbdchn(sport, R_DUART_STATUS) & M_DUART_TX_RDY;
+}
+
+static int __unused sbd_transmit_drain(struct sbd_port *sport)
+{
+	int loops = 10000;
+
+	while (!sbd_transmit_ready(sport) && loops--)
+		udelay(2);
+	return loops;
+}
+
+static int sbd_transmit_empty(struct sbd_port *sport)
+{
+	return read_sbdchn(sport, R_DUART_STATUS) & M_DUART_TX_EMT;
+}
+
+static int sbd_line_drain(struct sbd_port *sport)
+{
+	int loops = 10000;
+
+	while (!sbd_transmit_empty(sport) && loops--)
+		udelay(2);
+	return loops;
+}
+
+
+static unsigned int sbd_tx_empty(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	return sbd_transmit_empty(sport) ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int sbd_get_mctrl(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+	unsigned int mctrl, status;
+
+	status = read_sbdshr(sport, R_DUART_IN_PORT);
+	status >>= (uport->line) % 2;
+	mctrl = (!(status & M_DUART_IN_PIN0_VAL) ? TIOCM_CTS : 0) |
+		(!(status & M_DUART_IN_PIN4_VAL) ? TIOCM_CAR : 0) |
+		(!(status & M_DUART_RIN0_PIN) ? TIOCM_RNG : 0) |
+		(!(status & M_DUART_IN_PIN2_VAL) ? TIOCM_DSR : 0);
+	return mctrl;
+}
+
+static void sbd_set_mctrl(struct uart_port *uport, unsigned int mctrl)
+{
+	struct sbd_port *sport = to_sport(uport);
+	unsigned int clr = 0, set = 0, mode2;
+
+	if (mctrl & TIOCM_DTR)
+		set |= M_DUART_SET_OPR2;
+	else
+		clr |= M_DUART_CLR_OPR2;
+	if (mctrl & TIOCM_RTS)
+		set |= M_DUART_SET_OPR0;
+	else
+		clr |= M_DUART_CLR_OPR0;
+	clr <<= (uport->line) % 2;
+	set <<= (uport->line) % 2;
+
+	mode2 = read_sbdchn(sport, R_DUART_MODE_REG_2);
+	mode2 &= ~M_DUART_CHAN_MODE;
+	if (mctrl & TIOCM_LOOP)
+		mode2 |= V_DUART_CHAN_MODE_LCL_LOOP;
+	else
+		mode2 |= V_DUART_CHAN_MODE_NORMAL;
+
+	write_sbdshr(sport, R_DUART_CLEAR_OPR, clr);
+	write_sbdshr(sport, R_DUART_SET_OPR, set);
+	write_sbdchn(sport, R_DUART_MODE_REG_2, mode2);
+}
+
+static void sbd_stop_tx(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	write_sbdchn(sport, R_DUART_CMD, M_DUART_TX_DIS);
+	sport->tx_stopped = 1;
+};
+
+static void sbd_start_tx(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+	unsigned int mask;
+
+	/* Enable tx interrupts.  */
+	mask = read_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2));
+	mask |= M_DUART_IMR_TX;
+	write_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2), mask);
+
+	/* Go!, go!, go!...  */
+	write_sbdchn(sport, R_DUART_CMD, M_DUART_TX_EN);
+	sport->tx_stopped = 0;
+};
+
+static void sbd_stop_rx(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	write_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2), 0);
+};
+
+static void sbd_enable_ms(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	write_sbdchn(sport, R_DUART_AUXCTL_X,
+		     M_DUART_CIN_CHNG_ENA | M_DUART_CTS_CHNG_ENA);
+}
+
+static void sbd_break_ctl(struct uart_port *uport, int break_state)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	if (break_state == -1)
+		write_sbdchn(sport, R_DUART_CMD, V_DUART_MISC_CMD_START_BREAK);
+	else
+		write_sbdchn(sport, R_DUART_CMD, V_DUART_MISC_CMD_STOP_BREAK);
+}
+
+
+static void sbd_receive_chars(struct sbd_port *sport)
+{
+	struct uart_port *uport = &sport->port;
+	struct uart_icount *icount;
+	unsigned int status, ch, flag;
+	int count;
+
+	for (count = 16; count; count--) {
+		status = read_sbdchn(sport, R_DUART_STATUS);
+		if (!(status & M_DUART_RX_RDY))
+			break;
+
+		ch = read_sbdchn(sport, R_DUART_RX_HOLD);
+
+		flag = TTY_NORMAL;
+
+		icount = &uport->icount;
+		icount->rx++;
+
+		if (unlikely(status &
+			     (M_DUART_RCVD_BRK | M_DUART_FRM_ERR |
+			      M_DUART_PARITY_ERR | M_DUART_OVRUN_ERR))) {
+			if (status & M_DUART_RCVD_BRK) {
+				icount->brk++;
+				if (uart_handle_break(uport))
+					continue;
+			} else if (status & M_DUART_FRM_ERR)
+				icount->frame++;
+			else if (status & M_DUART_PARITY_ERR)
+				icount->parity++;
+			if (status & M_DUART_OVRUN_ERR)
+				icount->overrun++;
+
+			status &= uport->read_status_mask;
+			if (status & M_DUART_RCVD_BRK)
+				flag = TTY_BREAK;
+			else if (status & M_DUART_FRM_ERR)
+				flag = TTY_FRAME;
+			else if (status & M_DUART_PARITY_ERR)
+				flag = TTY_PARITY;
+		}
+
+		if (uart_handle_sysrq_char(uport, ch))
+			continue;
+
+		uart_insert_char(uport, status, M_DUART_OVRUN_ERR, ch, flag);
+	}
+
+	tty_flip_buffer_push(uport->info->tty);
+}
+
+static void sbd_transmit_chars(struct sbd_port *sport)
+{
+	struct uart_port *uport = &sport->port;
+	struct circ_buf *xmit = &sport->port.info->xmit;
+	unsigned int mask;
+	int stop_tx;
+
+	/* XON/XOFF chars.  */
+	if (sport->port.x_char) {
+		write_sbdchn(sport, R_DUART_TX_HOLD, sport->port.x_char);
+		sport->port.icount.tx++;
+		sport->port.x_char = 0;
+		return;
+	}
+
+	/* If nothing to do or stopped or hardware stopped.  */
+	stop_tx = (uart_circ_empty(xmit) || uart_tx_stopped(&sport->port));
+
+	/* Send char.  */
+	if (!stop_tx) {
+		write_sbdchn(sport, R_DUART_TX_HOLD, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		sport->port.icount.tx++;
+
+		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+			uart_write_wakeup(&sport->port);
+	}
+
+	/* Are we are done?  */
+	if (stop_tx || uart_circ_empty(xmit)) {
+		/* Disable tx interrupts.  */
+		mask = read_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2));
+		mask &= ~M_DUART_IMR_TX;
+		write_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2), mask);
+	}
+}
+
+static void sbd_status_handle(struct sbd_port *sport)
+{
+	struct uart_port *uport = &sport->port;
+	unsigned int delta;
+
+	delta = read_sbdshr(sport, R_DUART_INCHREG((uport->line) % 2));
+	delta >>= (uport->line) % 2;
+
+	if (delta & (M_DUART_IN_PIN0_VAL << S_DUART_IN_PIN_CHNG))
+		uart_handle_cts_change(uport, !(delta & M_DUART_IN_PIN0_VAL));
+
+	if (delta & (M_DUART_IN_PIN2_VAL << S_DUART_IN_PIN_CHNG))
+		uport->icount.dsr++;
+
+	if (delta & ((M_DUART_IN_PIN2_VAL | M_DUART_IN_PIN0_VAL) <<
+		     S_DUART_IN_PIN_CHNG))
+		wake_up_interruptible(&uport->info->delta_msr_wait);
+}
+
+static irqreturn_t sbd_interrupt(int irq, void *dev_id)
+{
+	struct sbd_port *sport = dev_id;
+	struct uart_port *uport = &sport->port;
+	irqreturn_t status = IRQ_NONE;
+	unsigned int intstat;
+	int count;
+
+	for (count = 16; count; count--) {
+		intstat = read_sbdshr(sport,
+				      R_DUART_ISRREG((uport->line) % 2));
+		intstat &= read_sbdshr(sport,
+				       R_DUART_IMRREG((uport->line) % 2));
+		intstat &= M_DUART_ISR_ALL;
+		if (!intstat)
+			break;
+
+		if (intstat & M_DUART_ISR_RX)
+			sbd_receive_chars(sport);
+		if (intstat & M_DUART_ISR_IN)
+			sbd_status_handle(sport);
+		if (intstat & M_DUART_ISR_TX)
+			sbd_transmit_chars(sport);
+
+		status = IRQ_HANDLED;
+	}
+
+	return status;
+}
+
+
+static int sbd_startup(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+	unsigned int mode1;
+	int ret;
+
+	ret = request_irq(sport->port.irq, sbd_interrupt,
+			  IRQF_SHARED, "sb1250-duart", sport);
+	if (ret)
+		return ret;
+
+	/* Clear the receive FIFO.  */
+	sbd_receive_drain(sport);
+
+	/* Clear the interrupt registers.  */
+	write_sbdchn(sport, R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT);
+	read_sbdshr(sport, R_DUART_INCHREG((uport->line) % 2));
+
+	/* Set rx/tx interrupt to FIFO available.  */
+	mode1 = read_sbdchn(sport, R_DUART_MODE_REG_1);
+	mode1 &= ~(M_DUART_RX_IRQ_SEL_RXFULL | M_DUART_TX_IRQ_SEL_TXEMPT);
+	write_sbdchn(sport, R_DUART_MODE_REG_1, mode1);
+
+	/* Disable tx, enable rx.  */
+	write_sbdchn(sport, R_DUART_CMD, M_DUART_TX_DIS | M_DUART_RX_EN);
+	sport->tx_stopped = 1;
+
+	/* Enable interrupts.  */
+	write_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2),
+		     M_DUART_IMR_IN | M_DUART_IMR_RX);
+
+	return 0;
+}
+
+static void sbd_shutdown(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	write_sbdchn(sport, R_DUART_CMD, M_DUART_TX_DIS | M_DUART_RX_DIS);
+	sport->tx_stopped = 1;
+	free_irq(sport->port.irq, sport);
+}
+
+
+static void sbd_init_port(struct sbd_port *sport)
+{
+	struct uart_port *uport = &sport->port;
+
+	if (sport->initialised)
+		return;
+
+	/* There is no DUART reset feature, so just set some sane defaults.  */
+	write_sbdchn(sport, R_DUART_CMD, V_DUART_MISC_CMD_RESET_TX);
+	write_sbdchn(sport, R_DUART_CMD, V_DUART_MISC_CMD_RESET_RX);
+	write_sbdchn(sport, R_DUART_MODE_REG_1, V_DUART_BITS_PER_CHAR_8);
+	write_sbdchn(sport, R_DUART_MODE_REG_2, 0);
+	write_sbdchn(sport, R_DUART_FULL_CTL,
+		     V_DUART_INT_TIME(0) | V_DUART_SIG_FULL(15));
+	write_sbdchn(sport, R_DUART_OPCR_X, 0);
+	write_sbdchn(sport, R_DUART_AUXCTL_X, 0);
+	write_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2), 0);
+
+	sport->initialised = 1;
+}
+
+static void sbd_set_termios(struct uart_port *uport, struct ktermios *termios,
+			    struct ktermios *old_termios)
+{
+	struct sbd_port *sport = to_sport(uport);
+	unsigned int mode1 = 0, mode2 = 0, aux = 0;
+	unsigned int mode1mask = 0, mode2mask = 0, auxmask = 0;
+	unsigned int oldmode1, oldmode2, oldaux;
+	unsigned int baud, brg;
+	unsigned int command;
+
+	mode1mask |= ~(M_DUART_PARITY_MODE | M_DUART_PARITY_TYPE_ODD |
+		       M_DUART_BITS_PER_CHAR);
+	mode2mask |= ~M_DUART_STOP_BIT_LEN_2;
+	auxmask |= ~M_DUART_CTS_CHNG_ENA;
+
+	/* Byte size.  */
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+	case CS6:
+		/* Unsupported, leave unchanged.  */
+		mode1mask |= M_DUART_PARITY_MODE;
+		break;
+	case CS7:
+		mode1 |= V_DUART_BITS_PER_CHAR_7;
+		break;
+	case CS8:
+	default:
+		mode1 |= V_DUART_BITS_PER_CHAR_8;
+		break;
+	}
+
+	/* Parity and stop bits.  */
+	if (termios->c_cflag & CSTOPB)
+		mode2 |= M_DUART_STOP_BIT_LEN_2;
+	else
+		mode2 |= M_DUART_STOP_BIT_LEN_1;
+	if (termios->c_cflag & PARENB)
+		mode1 |= V_DUART_PARITY_MODE_ADD;
+	else
+		mode1 |= V_DUART_PARITY_MODE_NONE;
+	if (termios->c_cflag & PARODD)
+		mode1 |= M_DUART_PARITY_TYPE_ODD;
+	else
+		mode1 |= M_DUART_PARITY_TYPE_EVEN;
+
+	baud = uart_get_baud_rate(uport, termios, old_termios, 1200, 5000000);
+	brg = V_DUART_BAUD_RATE(baud);
+	/* The actual lower bound is 1221bps, so compensate.  */
+	if (brg > M_DUART_CLK_COUNTER)
+		brg = M_DUART_CLK_COUNTER;
+
+	uart_update_timeout(uport, termios->c_cflag, baud);
+
+	uport->read_status_mask = M_DUART_OVRUN_ERR;
+	if (termios->c_iflag & INPCK)
+		uport->read_status_mask |= M_DUART_FRM_ERR |
+					   M_DUART_PARITY_ERR;
+	if (termios->c_iflag & (BRKINT | PARMRK))
+		uport->read_status_mask |= M_DUART_RCVD_BRK;
+
+	uport->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		uport->ignore_status_mask |= M_DUART_FRM_ERR |
+					     M_DUART_PARITY_ERR;
+	if (termios->c_iflag & IGNBRK) {
+		uport->ignore_status_mask |= M_DUART_RCVD_BRK;
+		if (termios->c_iflag & IGNPAR)
+			uport->ignore_status_mask |= M_DUART_OVRUN_ERR;
+	}
+
+	if (termios->c_cflag & CREAD)
+		command = M_DUART_RX_EN;
+	else
+		command = M_DUART_RX_DIS;
+
+	if (termios->c_cflag & CRTSCTS)
+		aux |= M_DUART_CTS_CHNG_ENA;
+	else
+		aux &= ~M_DUART_CTS_CHNG_ENA;
+
+	spin_lock(&uport->lock);
+
+	if (sport->tx_stopped)
+		command |= M_DUART_TX_DIS;
+	else
+		command |= M_DUART_TX_EN;
+
+	oldmode1 = read_sbdchn(sport, R_DUART_MODE_REG_1) & mode1mask;
+	oldmode2 = read_sbdchn(sport, R_DUART_MODE_REG_2) & mode2mask;
+	oldaux = read_sbdchn(sport, R_DUART_AUXCTL_X) & auxmask;
+
+	if (!sport->tx_stopped)
+		sbd_line_drain(sport);
+	write_sbdchn(sport, R_DUART_CMD, M_DUART_TX_DIS | M_DUART_RX_DIS);
+
+	write_sbdchn(sport, R_DUART_MODE_REG_1, mode1 | oldmode1);
+	write_sbdchn(sport, R_DUART_MODE_REG_2, mode2 | oldmode2);
+	write_sbdchn(sport, R_DUART_CLK_SEL, brg);
+	write_sbdchn(sport, R_DUART_AUXCTL_X, aux | oldaux);
+
+	write_sbdchn(sport, R_DUART_CMD, command);
+
+	spin_unlock(&uport->lock);
+}
+
+
+static const char *sbd_type(struct uart_port *uport)
+{
+	return "SB1250 DUART";
+}
+
+static void sbd_release_port(struct uart_port *uport)
+{
+	struct sbd_port *sport = to_sport(uport);
+	struct sbd_duart *duart = sport->duart;
+	int map_guard;
+
+	iounmap(sport->memctrl);
+	sport->memctrl = NULL;
+	iounmap(uport->membase);
+	uport->membase = NULL;
+
+	map_guard = atomic_add_return(-1, &duart->map_guard);
+	if (!map_guard)
+		release_mem_region(duart->mapctrl, DUART_CHANREG_SPACING);
+	release_mem_region(uport->mapbase, DUART_CHANREG_SPACING);
+}
+
+static int sbd_map_port(struct uart_port *uport)
+{
+	static const char *err = KERN_ERR "sbd: Cannot map MMIO\n";
+	struct sbd_port *sport = to_sport(uport);
+	struct sbd_duart *duart = sport->duart;
+
+	if (!uport->membase)
+		uport->membase = ioremap_nocache(uport->mapbase,
+						 DUART_CHANREG_SPACING);
+	if (!uport->membase) {
+		printk(err);
+		return -ENOMEM;
+	}
+
+	if (!sport->memctrl)
+		sport->memctrl = ioremap_nocache(duart->mapctrl,
+						 DUART_CHANREG_SPACING);
+	if (!sport->memctrl) {
+		printk(err);
+		iounmap(uport->membase);
+		uport->membase = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int sbd_request_port(struct uart_port *uport)
+{
+	static const char *err = KERN_ERR
+				 "sbd: Unable to reserve MMIO resource\n";
+	struct sbd_duart *duart = to_sport(uport)->duart;
+	int map_guard;
+	int ret = 0;
+
+	if (!request_mem_region(uport->mapbase, DUART_CHANREG_SPACING,
+				"sb1250-duart")) {
+		printk(err);
+		return -EBUSY;
+	}
+	map_guard = atomic_add_return(1, &duart->map_guard);
+	if (map_guard == 1) {
+		if (!request_mem_region(duart->mapctrl, DUART_CHANREG_SPACING,
+					"sb1250-duart")) {
+			atomic_add(-1, &duart->map_guard);
+			printk(err);
+			ret = -EBUSY;
+		}
+	}
+	if (!ret) {
+		ret = sbd_map_port(uport);
+		if (ret) {
+			map_guard = atomic_add_return(-1, &duart->map_guard);
+			if (!map_guard)
+				release_mem_region(duart->mapctrl,
+						   DUART_CHANREG_SPACING);
+		}
+	}
+	if (ret) {
+		release_mem_region(uport->mapbase, DUART_CHANREG_SPACING);
+		return ret;
+	}
+	return 0;
+}
+
+static void sbd_config_port(struct uart_port *uport, int flags)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	if (flags & UART_CONFIG_TYPE) {
+		if (sbd_request_port(uport))
+			return;
+
+		uport->type = PORT_SB1250_DUART;
+
+		sbd_init_port(sport);
+	}
+}
+
+static int sbd_verify_port(struct uart_port *uport, struct serial_struct *ser)
+{
+	int ret = 0;
+
+	if (ser->type != PORT_UNKNOWN && ser->type != PORT_SB1250_DUART)
+		ret = -EINVAL;
+	if (ser->irq != uport->irq)
+		ret = -EINVAL;
+	if (ser->baud_base != uport->uartclk / 16)
+		ret = -EINVAL;
+	return ret;
+}
+
+
+static struct uart_ops sbd_ops = {
+	.tx_empty	= sbd_tx_empty,
+	.set_mctrl	= sbd_set_mctrl,
+	.get_mctrl	= sbd_get_mctrl,
+	.stop_tx	= sbd_stop_tx,
+	.start_tx	= sbd_start_tx,
+	.stop_rx	= sbd_stop_rx,
+	.enable_ms	= sbd_enable_ms,
+	.break_ctl	= sbd_break_ctl,
+	.startup	= sbd_startup,
+	.shutdown	= sbd_shutdown,
+	.set_termios	= sbd_set_termios,
+	.type		= sbd_type,
+	.release_port	= sbd_release_port,
+	.request_port	= sbd_request_port,
+	.config_port	= sbd_config_port,
+	.verify_port	= sbd_verify_port,
+};
+
+/* Initialize SB1250 DUART port structures.  */
+static void __init sbd_probe_duarts(void)
+{
+	static int probed;
+	int chip, side;
+	int max_lines, line;
+
+	if (probed)
+		return;
+
+	/* Set the number of available units based on the SOC type.  */
+	switch (soc_type) {
+	case K_SYS_SOC_TYPE_BCM1x55:
+	case K_SYS_SOC_TYPE_BCM1x80:
+		max_lines = 4;
+		break;
+	default:
+		/* Assume at least two serial ports at the normal address.  */
+		max_lines = 2;
+		break;
+	}
+
+	probed = 1;
+
+	for (chip = 0, line = 0; chip < DUART_MAX_CHIP && line < max_lines;
+	     chip++) {
+		sbd_duarts[chip].mapctrl = SBD_CTRLREGS(line);
+
+		for (side = 0; side < DUART_MAX_SIDE && line < max_lines;
+		     side++, line++) {
+			struct sbd_port *sport = &sbd_duarts[chip].sport[side];
+			struct uart_port *uport = &sport->port;
+
+			sport->duart	= &sbd_duarts[chip];
+
+			uport->irq	= SBD_INT(line);
+			uport->uartclk	= 100000000 / 20 * 16;
+			uport->fifosize	= 16;
+			uport->iotype	= UPIO_MEM;
+			uport->flags	= UPF_BOOT_AUTOCONF;
+			uport->ops	= &sbd_ops;
+			uport->line	= line;
+			uport->mapbase	= SBD_CHANREGS(line);
+		}
+	}
+}
+
+
+#ifdef CONFIG_SERIAL_SB1250_DUART_CONSOLE
+/*
+ * Serial console stuff.  Very basic, polling driver for doing serial
+ * console output.  The console_sem is held by the caller, so we
+ * shouldn't be interrupted for more console activity.
+ */
+static void sbd_console_putchar(struct uart_port *uport, int ch)
+{
+	struct sbd_port *sport = to_sport(uport);
+
+	sbd_transmit_drain(sport);
+	write_sbdchn(sport, R_DUART_TX_HOLD, ch);
+}
+
+static void sbd_console_write(struct console *co, const char *s,
+			      unsigned int count)
+{
+	int chip = co->index / DUART_MAX_SIDE;
+	int side = co->index % DUART_MAX_SIDE;
+	struct sbd_port *sport = &sbd_duarts[chip].sport[side];
+	struct uart_port *uport = &sport->port;
+	unsigned long flags;
+	unsigned int mask;
+
+	/* Disable transmit interrupts and enable the transmitter. */
+	spin_lock_irqsave(&uport->lock, flags);
+	mask = read_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2));
+	write_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2),
+		     mask & ~M_DUART_IMR_TX);
+	write_sbdchn(sport, R_DUART_CMD, M_DUART_TX_EN);
+	spin_unlock_irqrestore(&uport->lock, flags);
+
+	uart_console_write(&sport->port, s, count, sbd_console_putchar);
+
+	/* Restore transmit interrupts and the transmitter enable. */
+	spin_lock_irqsave(&uport->lock, flags);
+	sbd_line_drain(sport);
+	if (sport->tx_stopped)
+		write_sbdchn(sport, R_DUART_CMD, M_DUART_TX_DIS);
+	write_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2), mask);
+	spin_unlock_irqrestore(&uport->lock, flags);
+}
+
+static int __init sbd_console_setup(struct console *co, char *options)
+{
+	int chip = co->index / DUART_MAX_SIDE;
+	int side = co->index % DUART_MAX_SIDE;
+	struct sbd_port *sport = &sbd_duarts[chip].sport[side];
+	struct uart_port *uport = &sport->port;
+	int baud = 115200;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+	int ret;
+
+	if (!sport->duart)
+		return -ENXIO;
+
+	ret = sbd_map_port(uport);
+	if (ret)
+		return ret;
+
+	sbd_init_port(sport);
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+	return uart_set_options(uport, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver sbd_reg;
+static struct console sbd_console = {
+	.name	= "duart",
+	.write	= sbd_console_write,
+	.device	= uart_console_device,
+	.setup	= sbd_console_setup,
+	.flags	= CON_PRINTBUFFER,
+	.index	= -1,
+	.data	= &sbd_reg
+};
+
+static int __init sbd_serial_console_init(void)
+{
+	sbd_probe_duarts();
+	register_console(&sbd_console);
+
+	return 0;
+}
+
+console_initcall(sbd_serial_console_init);
+
+#define SERIAL_SB1250_DUART_CONSOLE	&sbd_console
+#else
+#define SERIAL_SB1250_DUART_CONSOLE	NULL
+#endif /* CONFIG_SERIAL_SB1250_DUART_CONSOLE */
+
+
+static struct uart_driver sbd_reg = {
+	.owner		= THIS_MODULE,
+	.driver_name	= "serial",
+	.dev_name	= "duart",
+	.major		= TTY_MAJOR,
+	.minor		= SB1250_DUART_MINOR_BASE,
+	.nr		= DUART_MAX_CHIP * DUART_MAX_SIDE,
+	.cons		= SERIAL_SB1250_DUART_CONSOLE,
+};
+
+/* Set up the driver and register it.  */
+static int __init sbd_init(void)
+{
+	int i, ret;
+
+	sbd_probe_duarts();
+
+	ret = uart_register_driver(&sbd_reg);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < DUART_MAX_CHIP * DUART_MAX_SIDE; i++) {
+		struct sbd_duart *duart = &sbd_duarts[i / DUART_MAX_SIDE];
+		struct sbd_port *sport = &duart->sport[i % DUART_MAX_SIDE];
+		struct uart_port *uport = &sport->port;
+
+		if (sport->duart)
+			uart_add_one_port(&sbd_reg, uport);
+	}
+
+	return 0;
+}
+
+/* Unload the driver.  Unregister stuff, get ready to go away.  */
+static void __exit sbd_exit(void)
+{
+	int i;
+
+	for (i = DUART_MAX_CHIP * DUART_MAX_SIDE - 1; i >= 0; i--) {
+		struct sbd_duart *duart = &sbd_duarts[i / DUART_MAX_SIDE];
+		struct sbd_port *sport = &duart->sport[i % DUART_MAX_SIDE];
+		struct uart_port *uport = &sport->port;
+
+		if (sport->duart)
+			uart_remove_one_port(&sbd_reg, uport);
+	}
+
+	uart_unregister_driver(&sbd_reg);
+}
+
+module_init(sbd_init);
+module_exit(sbd_exit);
diff --git a/include/asm-mips/sibyte/bcm1480_regs.h b/include/asm-mips/sibyte/bcm1480_regs.h
index bda391d3af85..2738c1366f66 100644
--- a/include/asm-mips/sibyte/bcm1480_regs.h
+++ b/include/asm-mips/sibyte/bcm1480_regs.h
@@ -220,17 +220,25 @@
 #define A_BCM1480_DUART(chan)               ((((chan)&2) == 0)? A_BCM1480_DUART0 : A_BCM1480_DUART1)
 
 #define BCM1480_DUART_CHANREG_SPACING       0x100
-#define A_BCM1480_DUART_CHANREG(chan,reg)   (A_BCM1480_DUART(chan) \
-                                     + BCM1480_DUART_CHANREG_SPACING*((chan)&1) \
-                                     + (reg))
-#define R_BCM1480_DUART_CHANREG(chan,reg)   (BCM1480_DUART_CHANREG_SPACING*((chan)&1) + (reg))
-
-#define R_BCM1480_DUART_IMRREG(chan)	    (R_DUART_IMR_A + ((chan)&1)*DUART_IMRISR_SPACING)
-#define R_BCM1480_DUART_ISRREG(chan)	    (R_DUART_ISR_A + ((chan)&1)*DUART_IMRISR_SPACING)
-
-#define A_BCM1480_DUART_IMRREG(chan)	    (A_BCM1480_DUART(chan) + R_BCM1480_DUART_IMRREG(chan))
-#define A_BCM1480_DUART_ISRREG(chan)	    (A_BCM1480_DUART(chan) + R_BCM1480_DUART_ISRREG(chan))
-#define A_BCM1480_DUART_IN_PORT(chan)       (A_BCM1480_DUART(chan) + R_DUART_INP_ORT)
+#define A_BCM1480_DUART_CHANREG(chan, reg)				\
+	(A_BCM1480_DUART(chan) +					\
+	 BCM1480_DUART_CHANREG_SPACING * (((chan) & 1) + 1) + (reg))
+#define A_BCM1480_DUART_CTRLREG(chan, reg)				\
+	(A_BCM1480_DUART(chan) +					\
+	 BCM1480_DUART_CHANREG_SPACING * 3 + (reg))
+
+#define R_BCM1480_DUART_IMRREG(chan)					\
+	(R_DUART_IMR_A + ((chan) & 1) * DUART_IMRISR_SPACING)
+#define R_BCM1480_DUART_ISRREG(chan)					\
+	(R_DUART_ISR_A + ((chan) & 1) * DUART_IMRISR_SPACING)
+
+#define A_BCM1480_DUART_IMRREG(chan)					\
+	(A_BCM1480_DUART_CTRLREG((chan), R_BCM1480_DUART_IMRREG(chan)))
+#define A_BCM1480_DUART_ISRREG(chan)					\
+	(A_BCM1480_DUART_CTRLREG((chan), R_BCM1480_DUART_ISRREG(chan)))
+
+#define A_BCM1480_DUART_IN_PORT(chan)					\
+	(A_BCM1480_DUART_CTRLREG((chan), R_DUART_IN_PORT))
 
 /*
  * These constants are the absolute addresses.
diff --git a/include/asm-mips/sibyte/sb1250_regs.h b/include/asm-mips/sibyte/sb1250_regs.h
index da7c188993c9..220b7e94f1bf 100644
--- a/include/asm-mips/sibyte/sb1250_regs.h
+++ b/include/asm-mips/sibyte/sb1250_regs.h
@@ -272,59 +272,69 @@
     ********************************************************************* */
 
 
-#if SIBYTE_HDR_FEATURE_1250_112x		/* This MC only on 1250 & 112x */
+#if SIBYTE_HDR_FEATURE_1250_112x    /* This MC only on 1250 & 112x */
 #define R_DUART_NUM_PORTS           2
 
 #define A_DUART                     0x0010060000
 
 #define DUART_CHANREG_SPACING       0x100
-#define A_DUART_CHANREG(chan,reg)   (A_DUART + DUART_CHANREG_SPACING*(chan) + (reg))
-#define R_DUART_CHANREG(chan,reg)   (DUART_CHANREG_SPACING*(chan) + (reg))
+
+#define A_DUART_CHANREG(chan, reg)					\
+	(A_DUART + DUART_CHANREG_SPACING * ((chan) + 1) + (reg))
 #endif	/* 1250 & 112x */
 
-#define R_DUART_MODE_REG_1	    0x100
-#define R_DUART_MODE_REG_2	    0x110
-#define R_DUART_STATUS              0x120
-#define R_DUART_CLK_SEL             0x130
-#define R_DUART_CMD                 0x150
-#define R_DUART_RX_HOLD             0x160
-#define R_DUART_TX_HOLD             0x170
+#define R_DUART_MODE_REG_1	    0x000
+#define R_DUART_MODE_REG_2	    0x010
+#define R_DUART_STATUS		    0x020
+#define R_DUART_CLK_SEL		    0x030
+#define R_DUART_CMD		    0x050
+#define R_DUART_RX_HOLD		    0x060
+#define R_DUART_TX_HOLD		    0x070
 
 #if SIBYTE_HDR_FEATURE(1250, PASS2) || SIBYTE_HDR_FEATURE(112x, PASS1) || SIBYTE_HDR_FEATURE_CHIP(1480)
-#define R_DUART_FULL_CTL	    0x140
-#define R_DUART_OPCR_X		    0x180
-#define R_DUART_AUXCTL_X	    0x190
-#endif /* 1250 PASS2 || 112x PASS1 || 1480*/
+#define R_DUART_FULL_CTL	    0x040
+#define R_DUART_OPCR_X		    0x080
+#define R_DUART_AUXCTL_X	    0x090
+#endif /* 1250 PASS2 || 112x PASS1 || 1480 */
 
 
 /*
  * The IMR and ISR can't be addressed with A_DUART_CHANREG,
- * so use this macro instead.
+ * so use these macros instead.
  */
 
-#define R_DUART_AUX_CTRL            0x310
-#define R_DUART_ISR_A               0x320
-#define R_DUART_IMR_A               0x330
-#define R_DUART_ISR_B               0x340
-#define R_DUART_IMR_B               0x350
-#define R_DUART_OUT_PORT            0x360
-#define R_DUART_OPCR                0x370
-#define R_DUART_IN_PORT             0x380
+#if SIBYTE_HDR_FEATURE_1250_112x    /* This MC only on 1250 & 112x */
+#define DUART_IMRISR_SPACING	    0x20
+#define DUART_INCHNG_SPACING	    0x10
 
-#define R_DUART_SET_OPR		    0x3B0
-#define R_DUART_CLEAR_OPR	    0x3C0
+#define A_DUART_CTRLREG(reg)						\
+	(A_DUART + DUART_CHANREG_SPACING * 3 + (reg))
 
-#define DUART_IMRISR_SPACING        0x20
+#define R_DUART_IMRREG(chan)						\
+	(R_DUART_IMR_A + (chan) * DUART_IMRISR_SPACING)
+#define R_DUART_ISRREG(chan)						\
+	(R_DUART_ISR_A + (chan) * DUART_IMRISR_SPACING)
+#define R_DUART_INCHREG(chan)						\
+	(R_DUART_IN_CHNG_A + (chan) * DUART_INCHNG_SPACING)
 
-#if SIBYTE_HDR_FEATURE_1250_112x		/* This MC only on 1250 & 112x */
-#define R_DUART_IMRREG(chan)	    (R_DUART_IMR_A + (chan)*DUART_IMRISR_SPACING)
-#define R_DUART_ISRREG(chan)	    (R_DUART_ISR_A + (chan)*DUART_IMRISR_SPACING)
-
-#define A_DUART_IMRREG(chan)	    (A_DUART + R_DUART_IMRREG(chan))
-#define A_DUART_ISRREG(chan)	    (A_DUART + R_DUART_ISRREG(chan))
+#define A_DUART_IMRREG(chan)	    A_DUART_CTRLREG(R_DUART_IMRREG(chan))
+#define A_DUART_ISRREG(chan)	    A_DUART_CTRLREG(R_DUART_ISRREG(chan))
+#define A_DUART_INCHREG(chan)	    A_DUART_CTRLREG(R_DUART_INCHREG(chan))
 #endif	/* 1250 & 112x */
 
-
+#define R_DUART_AUX_CTRL	    0x010
+#define R_DUART_ISR_A		    0x020
+#define R_DUART_IMR_A		    0x030
+#define R_DUART_ISR_B		    0x040
+#define R_DUART_IMR_B		    0x050
+#define R_DUART_OUT_PORT	    0x060
+#define R_DUART_OPCR		    0x070
+#define R_DUART_IN_PORT		    0x080
+
+#define R_DUART_SET_OPR		    0x0B0
+#define R_DUART_CLEAR_OPR	    0x0C0
+#define R_DUART_IN_CHNG_A	    0x0D0
+#define R_DUART_IN_CHNG_B	    0x0E0
 
 
 /*
diff --git a/include/asm-mips/sibyte/sb1250_uart.h b/include/asm-mips/sibyte/sb1250_uart.h
index e87045e62bf0..cf74fedcbef1 100644
--- a/include/asm-mips/sibyte/sb1250_uart.h
+++ b/include/asm-mips/sibyte/sb1250_uart.h
@@ -75,7 +75,8 @@
 #define V_DUART_PARITY_MODE_ADD_FIXED V_DUART_PARITY_MODE(K_DUART_PARITY_MODE_ADD_FIXED)
 #define V_DUART_PARITY_MODE_NONE      V_DUART_PARITY_MODE(K_DUART_PARITY_MODE_NONE)
 
-#define M_DUART_ERR_MODE            _SB_MAKEMASK1(5)    /* must be zero */
+#define M_DUART_TX_IRQ_SEL_TXRDY    0
+#define M_DUART_TX_IRQ_SEL_TXEMPT   _SB_MAKEMASK1(5)
 
 #define M_DUART_RX_IRQ_SEL_RXRDY    0
 #define M_DUART_RX_IRQ_SEL_RXFULL   _SB_MAKEMASK1(6)
@@ -246,10 +247,13 @@
 
 #define M_DUART_ISR_BRK_A           _SB_MAKEMASK1(2)
 #define M_DUART_ISR_IN_A            _SB_MAKEMASK1(3)
+#define M_DUART_ISR_ALL_A	    _SB_MAKEMASK(4,0)
+
 #define M_DUART_ISR_TX_B            _SB_MAKEMASK1(4)
 #define M_DUART_ISR_RX_B            _SB_MAKEMASK1(5)
 #define M_DUART_ISR_BRK_B           _SB_MAKEMASK1(6)
 #define M_DUART_ISR_IN_B            _SB_MAKEMASK1(7)
+#define M_DUART_ISR_ALL_B	    _SB_MAKEMASK(4,4)
 
 /*
  * DUART Channel A Interrupt Status Register (Table 10-17)
@@ -262,6 +266,7 @@
 #define M_DUART_ISR_RX              _SB_MAKEMASK1(1)
 #define M_DUART_ISR_BRK             _SB_MAKEMASK1(2)
 #define M_DUART_ISR_IN              _SB_MAKEMASK1(3)
+#define M_DUART_ISR_ALL		    _SB_MAKEMASK(4,0)
 #define M_DUART_ISR_RESERVED        _SB_MAKEMASK(4,4)
 
 /*
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 7f2c99d66e9d..9c721cd2c9d6 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -142,6 +142,9 @@
 /* Micrel KS8695 */
 #define PORT_KS8695	76
 
+/* Broadcom SB1250, etc. SOC */
+#define PORT_SB1250_DUART	77
+
 
 #ifdef __KERNEL__
 
-- 
cgit v1.3-8-gc7d7


From 9281acea6a3687ff0f262e0be31eac34895b95d7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 17 Jul 2007 04:03:51 -0700
Subject: kallsyms: make KSYM_NAME_LEN include space for trailing '\0'

KSYM_NAME_LEN is peculiar in that it does not include the space for the
trailing '\0', forcing all users to use KSYM_NAME_LEN + 1 when allocating
buffer.  This is nonsense and error-prone.  Moreover, when the caller
forgets that it's very likely to subtly bite back by corrupting the stack
because the last position of the buffer is always cleared to zero.

This patch increments KSYM_NAME_LEN by one and updates code accordingly.

* off-by-one bug in asm-powerpc/kprobes.h::kprobe_lookup_name() macro
  is fixed.

* Where MODULE_NAME_LEN and KSYM_NAME_LEN were used together,
  MODULE_NAME_LEN was treated as if it didn't include space for the
  trailing '\0'.  Fix it.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Paulo Marques <pmarques@grupopie.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/parisc/kernel/unwind.c   |  2 +-
 fs/proc/base.c                |  2 +-
 include/asm-powerpc/kprobes.h |  4 ++--
 include/linux/kallsyms.h      |  6 +++---
 kernel/kallsyms.c             | 16 ++++++++--------
 kernel/lockdep.c              |  4 ++--
 kernel/module.c               | 10 +++++-----
 kernel/time/timer_list.c      |  2 +-
 kernel/time/timer_stats.c     |  2 +-
 mm/slab.c                     |  2 +-
 scripts/kallsyms.c            |  4 ++--
 11 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c
index 322167737de7..cf780cb3b916 100644
--- a/arch/parisc/kernel/unwind.c
+++ b/arch/parisc/kernel/unwind.c
@@ -242,7 +242,7 @@ static void unwind_frame_regs(struct unwind_frame_info *info)
 #ifdef CONFIG_KALLSYMS
 		/* Handle some frequent special cases.... */
 		{
-			char symname[KSYM_NAME_LEN+1];
+			char symname[KSYM_NAME_LEN];
 			char *modname;
 
 			kallsyms_lookup(info->ip, NULL, NULL, &modname,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ae3627337a92..42cb4f5613b6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -283,7 +283,7 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
 static int proc_pid_wchan(struct task_struct *task, char *buffer)
 {
 	unsigned long wchan;
-	char symname[KSYM_NAME_LEN+1];
+	char symname[KSYM_NAME_LEN];
 
 	wchan = get_wchan(task);
 
diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h
index b0e40ff32ee0..9537fda238b8 100644
--- a/include/asm-powerpc/kprobes.h
+++ b/include/asm-powerpc/kprobes.h
@@ -65,10 +65,10 @@ typedef unsigned int kprobe_opcode_t;
 		} else if (name[0] != '.')				\
 			addr = *(kprobe_opcode_t **)addr;		\
 	} else {							\
-		char dot_name[KSYM_NAME_LEN+1];				\
+		char dot_name[KSYM_NAME_LEN];				\
 		dot_name[0] = '.';					\
 		dot_name[1] = '\0';					\
-		strncat(dot_name, name, KSYM_NAME_LEN);			\
+		strncat(dot_name, name, KSYM_NAME_LEN - 2);		\
 		addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); \
 	}								\
 }
diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 5f06527dca21..f73de6fb5c68 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -7,9 +7,9 @@
 
 #include <linux/errno.h>
 
-#define KSYM_NAME_LEN 127
-#define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +	\
-			 2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1)
+#define KSYM_NAME_LEN 128
+#define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \
+			 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1)
 
 #ifdef CONFIG_KALLSYMS
 /* Lookup the address for a symbol. Returns 0 if not found. */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 0d662475dd9f..474219a41929 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -152,7 +152,7 @@ static unsigned int get_symbol_offset(unsigned long pos)
 /* Lookup the address for this symbol. Returns 0 if not found. */
 unsigned long kallsyms_lookup_name(const char *name)
 {
-	char namebuf[KSYM_NAME_LEN+1];
+	char namebuf[KSYM_NAME_LEN];
 	unsigned long i;
 	unsigned int off;
 
@@ -248,7 +248,7 @@ const char *kallsyms_lookup(unsigned long addr,
 {
 	const char *msym;
 
-	namebuf[KSYM_NAME_LEN] = 0;
+	namebuf[KSYM_NAME_LEN - 1] = 0;
 	namebuf[0] = 0;
 
 	if (is_ksym_addr(addr)) {
@@ -265,7 +265,7 @@ const char *kallsyms_lookup(unsigned long addr,
 	/* see if it's in a module */
 	msym = module_address_lookup(addr, symbolsize, offset, modname);
 	if (msym)
-		return strncpy(namebuf, msym, KSYM_NAME_LEN);
+		return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
 
 	return NULL;
 }
@@ -273,7 +273,7 @@ const char *kallsyms_lookup(unsigned long addr,
 int lookup_symbol_name(unsigned long addr, char *symname)
 {
 	symname[0] = '\0';
-	symname[KSYM_NAME_LEN] = '\0';
+	symname[KSYM_NAME_LEN - 1] = '\0';
 
 	if (is_ksym_addr(addr)) {
 		unsigned long pos;
@@ -291,7 +291,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
 			unsigned long *offset, char *modname, char *name)
 {
 	name[0] = '\0';
-	name[KSYM_NAME_LEN] = '\0';
+	name[KSYM_NAME_LEN - 1] = '\0';
 
 	if (is_ksym_addr(addr)) {
 		unsigned long pos;
@@ -312,7 +312,7 @@ int sprint_symbol(char *buffer, unsigned long address)
 	char *modname;
 	const char *name;
 	unsigned long offset, size;
-	char namebuf[KSYM_NAME_LEN+1];
+	char namebuf[KSYM_NAME_LEN];
 
 	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
 	if (!name)
@@ -342,8 +342,8 @@ struct kallsym_iter
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols */
 	char type;
-	char name[KSYM_NAME_LEN+1];
-	char module_name[MODULE_NAME_LEN + 1];
+	char name[KSYM_NAME_LEN];
+	char module_name[MODULE_NAME_LEN];
 	int exported;
 };
 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1a5ff2211d88..edba2ffb43de 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -379,7 +379,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
+	char str[KSYM_NAME_LEN], c1, c2, c3, c4;
 	const char *name;
 
 	get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -401,7 +401,7 @@ static void print_lock_name(struct lock_class *class)
 static void print_lockdep_cache(struct lockdep_map *lock)
 {
 	const char *name;
-	char str[KSYM_NAME_LEN + 1];
+	char str[KSYM_NAME_LEN];
 
 	name = lock->name;
 	if (!name)
diff --git a/kernel/module.c b/kernel/module.c
index 539fed9ac83c..33c04ad51175 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2133,7 +2133,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
 			sym = get_ksymbol(mod, addr, NULL, NULL);
 			if (!sym)
 				goto out;
-			strlcpy(symname, sym, KSYM_NAME_LEN + 1);
+			strlcpy(symname, sym, KSYM_NAME_LEN);
 			mutex_unlock(&module_mutex);
 			return 0;
 		}
@@ -2158,9 +2158,9 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
 			if (!sym)
 				goto out;
 			if (modname)
-				strlcpy(modname, mod->name, MODULE_NAME_LEN + 1);
+				strlcpy(modname, mod->name, MODULE_NAME_LEN);
 			if (name)
-				strlcpy(name, sym, KSYM_NAME_LEN + 1);
+				strlcpy(name, sym, KSYM_NAME_LEN);
 			mutex_unlock(&module_mutex);
 			return 0;
 		}
@@ -2181,8 +2181,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 			*value = mod->symtab[symnum].st_value;
 			*type = mod->symtab[symnum].st_info;
 			strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
-				KSYM_NAME_LEN + 1);
-			strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1);
+				KSYM_NAME_LEN);
+			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
 			*exported = is_exported(name, mod);
 			mutex_unlock(&module_mutex);
 			return 0;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 8bbcfb77f7d2..e5edc3a22a08 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,7 +38,7 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
 
 static void print_name_offset(struct seq_file *m, void *sym)
 {
-	char symname[KSYM_NAME_LEN+1];
+	char symname[KSYM_NAME_LEN];
 
 	if (lookup_symbol_name((unsigned long)sym, symname) < 0)
 		SEQ_printf(m, "<%p>", sym);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 9b8a826236dd..8ed62fda16c6 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -269,7 +269,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 
 static void print_name_offset(struct seq_file *m, unsigned long addr)
 {
-	char symname[KSYM_NAME_LEN+1];
+	char symname[KSYM_NAME_LEN];
 
 	if (lookup_symbol_name(addr, symname) < 0)
 		seq_printf(m, "<%p>", (void *)addr);
diff --git a/mm/slab.c b/mm/slab.c
index 35056394139b..96d30ee256ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4344,7 +4344,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
 {
 #ifdef CONFIG_KALLSYMS
 	unsigned long offset, size;
-	char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1];
+	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
 
 	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
 		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 8b809b264d18..10b006694e5d 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -31,7 +31,7 @@
 #include <string.h>
 #include <ctype.h>
 
-#define KSYM_NAME_LEN		127
+#define KSYM_NAME_LEN		128
 
 
 struct sym_entry {
@@ -254,7 +254,7 @@ static void write_src(void)
 	unsigned int i, k, off;
 	unsigned int best_idx[256];
 	unsigned int *markers;
-	char buf[KSYM_NAME_LEN+1];
+	char buf[KSYM_NAME_LEN];
 
 	printf("#include <asm/types.h>\n");
 	printf("#if BITS_PER_LONG == 64\n");
-- 
cgit v1.3-8-gc7d7


From 5b78cc9ac8602baafebb75a09025ffb17d1aebc2 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Tue, 17 Jul 2007 04:03:53 -0700
Subject: make timespec_equal() take const arguments

Make arguments of timespec_equal() const struct timespec.

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 4bb05a829be9..ec3b0ced0afe 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -36,7 +36,8 @@ struct timezone {
 #define NSEC_PER_SEC	1000000000L
 #define FSEC_PER_SEC	1000000000000000L
 
-static inline int timespec_equal(struct timespec *a, struct timespec *b)
+static inline int timespec_equal(const struct timespec *a,
+                                 const struct timespec *b)
 {
 	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
 }
-- 
cgit v1.3-8-gc7d7


From dccd573bb02aa011a4a7146c02c409ac0bd722a0 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Tue, 17 Jul 2007 04:04:02 -0700
Subject: SPI controller drivers: check for unsupported modes

Minor SPI controller driver updates: make the setup() methods reject
spi->mode bits they don't support, by masking aginst the inverse of bits
they *do* support.  This insures against misbehavior later when new mode
bits get added.

Most controllers can't support SPI_LSB_FIRST; more handle SPI_CS_HIGH.
Support for all four SPI clock/transfer modes is routine.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/atmel_spi.c         |  1 +
 drivers/spi/au1550_spi.c        |  9 +++++++++
 drivers/spi/mpc52xx_psc_spi.c   |  9 +++++++++
 drivers/spi/omap_uwire.c        |  9 +++++++++
 drivers/spi/pxa2xx_spi.c        |  9 +++++++++
 drivers/spi/spi_bitbang.c       |  8 +++-----
 drivers/spi/spi_imx.c           | 24 +++++++++---------------
 drivers/spi/spi_mpc83xx.c       |  9 +++++++++
 drivers/spi/spi_s3c24xx.c       |  8 +++++++-
 include/linux/spi/spi_bitbang.h |  1 +
 10 files changed, 66 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c
index 8b2601de3630..7524a84bdd50 100644
--- a/drivers/spi/atmel_spi.c
+++ b/drivers/spi/atmel_spi.c
@@ -350,6 +350,7 @@ atmel_spi_interrupt(int irq, void *dev_id)
 	return ret;
 }
 
+/* the spi->mode bits understood by this driver: */
 #define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
 
 static int atmel_spi_setup(struct spi_device *spi)
diff --git a/drivers/spi/au1550_spi.c b/drivers/spi/au1550_spi.c
index ae2b1af0dba4..c47a650183a1 100644
--- a/drivers/spi/au1550_spi.c
+++ b/drivers/spi/au1550_spi.c
@@ -280,6 +280,9 @@ static int au1550_spi_setupxfer(struct spi_device *spi, struct spi_transfer *t)
 	return 0;
 }
 
+/* the spi->mode bits understood by this driver: */
+#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST)
+
 static int au1550_spi_setup(struct spi_device *spi)
 {
 	struct au1550_spi *hw = spi_master_get_devdata(spi->master);
@@ -292,6 +295,12 @@ static int au1550_spi_setup(struct spi_device *spi)
 		return -EINVAL;
 	}
 
+	if (spi->mode & ~MODEBITS) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			spi->mode & ~MODEBITS);
+		return -EINVAL;
+	}
+
 	if (spi->max_speed_hz == 0)
 		spi->max_speed_hz = hw->freq_max;
 	if (spi->max_speed_hz > hw->freq_max
diff --git a/drivers/spi/mpc52xx_psc_spi.c b/drivers/spi/mpc52xx_psc_spi.c
index 11f36bef3057..d2a4b2bdb07b 100644
--- a/drivers/spi/mpc52xx_psc_spi.c
+++ b/drivers/spi/mpc52xx_psc_spi.c
@@ -270,6 +270,9 @@ static void mpc52xx_psc_spi_work(struct work_struct *work)
 	spin_unlock_irq(&mps->lock);
 }
 
+/* the spi->mode bits understood by this driver: */
+#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH | SPI_LSB_FIRST)
+
 static int mpc52xx_psc_spi_setup(struct spi_device *spi)
 {
 	struct mpc52xx_psc_spi *mps = spi_master_get_devdata(spi->master);
@@ -279,6 +282,12 @@ static int mpc52xx_psc_spi_setup(struct spi_device *spi)
 	if (spi->bits_per_word%8)
 		return -EINVAL;
 
+	if (spi->mode & ~MODEBITS) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			spi->mode & ~MODEBITS);
+		return -EINVAL;
+	}
+
 	if (!cs) {
 		cs = kzalloc(sizeof *cs, GFP_KERNEL);
 		if (!cs)
diff --git a/drivers/spi/omap_uwire.c b/drivers/spi/omap_uwire.c
index 95183e1df525..d275c615a73e 100644
--- a/drivers/spi/omap_uwire.c
+++ b/drivers/spi/omap_uwire.c
@@ -445,10 +445,19 @@ done:
 	return status;
 }
 
+/* the spi->mode bits understood by this driver: */
+#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
+
 static int uwire_setup(struct spi_device *spi)
 {
 	struct uwire_state *ust = spi->controller_state;
 
+	if (spi->mode & ~MODEBITS) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			spi->mode & ~MODEBITS);
+		return -EINVAL;
+	}
+
 	if (ust == NULL) {
 		ust = kzalloc(sizeof(*ust), GFP_KERNEL);
 		if (ust == NULL)
diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c
index 9f2c887ffa04..e51311b2da0b 100644
--- a/drivers/spi/pxa2xx_spi.c
+++ b/drivers/spi/pxa2xx_spi.c
@@ -1067,6 +1067,9 @@ static int transfer(struct spi_device *spi, struct spi_message *msg)
 	return 0;
 }
 
+/* the spi->mode bits understood by this driver: */
+#define MODEBITS (SPI_CPOL | SPI_CPHA)
+
 static int setup(struct spi_device *spi)
 {
 	struct pxa2xx_spi_chip *chip_info = NULL;
@@ -1093,6 +1096,12 @@ static int setup(struct spi_device *spi)
 		return -EINVAL;
 	}
 
+	if (spi->mode & ~MODEBITS) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			spi->mode & ~MODEBITS);
+		return -EINVAL;
+	}
+
 	/* Only alloc on first setup */
 	chip = spi_get_ctldata(spi);
 	if (!chip) {
diff --git a/drivers/spi/spi_bitbang.c b/drivers/spi/spi_bitbang.c
index 88425e1af4d3..0c85c984ccb4 100644
--- a/drivers/spi/spi_bitbang.c
+++ b/drivers/spi/spi_bitbang.c
@@ -187,12 +187,10 @@ int spi_bitbang_setup(struct spi_device *spi)
 
 	bitbang = spi_master_get_devdata(spi->master);
 
-	/* REVISIT: some systems will want to support devices using lsb-first
-	 * bit encodings on the wire.  In pure software that would be trivial,
-	 * just bitbang_txrx_le_cphaX() routines shifting the other way, and
-	 * some hardware controllers also have this support.
+	/* Bitbangers can support SPI_CS_HIGH, SPI_3WIRE, and so on;
+	 * add those to master->flags, and provide the other support.
 	 */
-	if ((spi->mode & SPI_LSB_FIRST) != 0)
+	if ((spi->mode & ~(SPI_CPOL|SPI_CPHA|bitbang->flags)) != 0)
 		return -EINVAL;
 
 	if (!cs) {
diff --git a/drivers/spi/spi_imx.c b/drivers/spi/spi_imx.c
index 656be4a5094a..aee9ad6f633c 100644
--- a/drivers/spi/spi_imx.c
+++ b/drivers/spi/spi_imx.c
@@ -1163,6 +1163,9 @@ msg_rejected:
 	return -EINVAL;
 }
 
+/* the spi->mode bits understood by this driver: */
+#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
+
 /* On first setup bad values must free chip_data memory since will cause
    spi_new_device to fail. Bad value setup from protocol driver are simply not
    applied and notified to the calling driver. */
@@ -1174,6 +1177,12 @@ static int setup(struct spi_device *spi)
 	u32 tmp;
 	int status = 0;
 
+	if (spi->mode & ~MODEBITS) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			spi->mode & ~MODEBITS);
+		return -EINVAL;
+	}
+
 	/* Get controller data */
 	chip_info = spi->controller_data;
 
@@ -1245,21 +1254,6 @@ static int setup(struct spi_device *spi)
 
 	/* SPI mode */
 	tmp = spi->mode;
-	if (tmp & SPI_LSB_FIRST) {
-		status = -EINVAL;
-		if (first_setup) {
-			dev_err(&spi->dev,
-				"setup - "
-				"HW doesn't support LSB first transfer\n");
-			goto err_first_setup;
-		} else {
-			dev_err(&spi->dev,
-				"setup - "
-				"HW doesn't support LSB first transfer, "
-				"default to MSB first\n");
-			spi->mode &= ~SPI_LSB_FIRST;
-		}
-	}
 	if (tmp & SPI_CS_HIGH) {
 		u32_EDIT(chip->control,
 				SPI_CONTROL_SSPOL, SPI_CONTROL_SSPOL_ACT_HIGH);
diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c
index e9798bf7b8c6..9cdbc12278e5 100644
--- a/drivers/spi/spi_mpc83xx.c
+++ b/drivers/spi/spi_mpc83xx.c
@@ -232,12 +232,21 @@ int mpc83xx_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t)
 	return 0;
 }
 
+/* the spi->mode bits understood by this driver: */
+#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
+
 static int mpc83xx_spi_setup(struct spi_device *spi)
 {
 	struct spi_bitbang *bitbang;
 	struct mpc83xx_spi *mpc83xx_spi;
 	int retval;
 
+	if (spi->mode & ~MODEBITS) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			spi->mode & ~MODEBITS);
+		return -EINVAL;
+	}
+
 	if (!spi->max_speed_hz)
 		return -EINVAL;
 
diff --git a/drivers/spi/spi_s3c24xx.c b/drivers/spi/spi_s3c24xx.c
index d5a710f6e445..7071ff8da63e 100644
--- a/drivers/spi/spi_s3c24xx.c
+++ b/drivers/spi/spi_s3c24xx.c
@@ -146,6 +146,9 @@ static int s3c24xx_spi_setupxfer(struct spi_device *spi,
 	return 0;
 }
 
+/* the spi->mode bits understood by this driver: */
+#define MODEBITS (SPI_CPOL | SPI_CPHA | SPI_CS_HIGH)
+
 static int s3c24xx_spi_setup(struct spi_device *spi)
 {
 	int ret;
@@ -153,8 +156,11 @@ static int s3c24xx_spi_setup(struct spi_device *spi)
 	if (!spi->bits_per_word)
 		spi->bits_per_word = 8;
 
-	if ((spi->mode & SPI_LSB_FIRST) != 0)
+	if (spi->mode & ~MODEBITS) {
+		dev_dbg(&spi->dev, "setup: unsupported mode bits %x\n",
+			spi->mode & ~MODEBITS);
 		return -EINVAL;
+	}
 
 	ret = s3c24xx_spi_setupxfer(spi, NULL);
 	if (ret < 0) {
diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index 9dbca629dcfb..b8db32cea1de 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -26,6 +26,7 @@ struct spi_bitbang {
 	struct list_head	queue;
 	u8			busy;
 	u8			use_dma;
+	u8			flags;		/* extra spi->mode support */
 
 	struct spi_master	*master;
 
-- 
cgit v1.3-8-gc7d7


From c06e677aed0c86480b01faa894967daa8aa3568a Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Tue, 17 Jul 2007 04:04:03 -0700
Subject: SPI: add 3wire mode flag

Add a new spi->mode bit: SPI_3WIRE, for chips where the SI and SO signals
are shared (and which are thus only half duplex).  Update the LM70 driver
to require support for that hardware mode from the controller.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/hwmon/lm70.c    | 4 ++++
 include/linux/spi/spi.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/hwmon/lm70.c b/drivers/hwmon/lm70.c
index 7eaae3834e15..275d392eca61 100644
--- a/drivers/hwmon/lm70.c
+++ b/drivers/hwmon/lm70.c
@@ -96,6 +96,10 @@ static int __devinit lm70_probe(struct spi_device *spi)
 	struct lm70 *p_lm70;
 	int status;
 
+	/* signaling is SPI_MODE_0 on a 3-wire link (shared SI/SO) */
+	if ((spi->mode & (SPI_CPOL|SPI_CPHA)) || !(spi->mode & SPI_3WIRE))
+		return -EINVAL;
+
 	p_lm70 = kzalloc(sizeof *p_lm70, GFP_KERNEL);
 	if (!p_lm70)
 		return -ENOMEM;
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 1be5ea059477..302b81d1d117 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -76,6 +76,7 @@ struct spi_device {
 #define	SPI_MODE_3	(SPI_CPOL|SPI_CPHA)
 #define	SPI_CS_HIGH	0x04			/* chipselect active high? */
 #define	SPI_LSB_FIRST	0x08			/* per-word bits-on-wire */
+#define	SPI_3WIRE	0x10			/* SI/SO signals shared */
 	u8			bits_per_word;
 	int			irq;
 	void			*controller_state;
-- 
cgit v1.3-8-gc7d7


From ad241528c4919505afccb022acbab3eeb0db4d80 Mon Sep 17 00:00:00 2001
From: Jan Nikitenko <jan.nikitenko@gmail.com>
Date: Tue, 17 Jul 2007 04:04:03 -0700
Subject: CRC7 support

Add CRC7 routines, used for example in MMC over SPI communication.
Kerneldoc updates

[akpm@linux-foundation.org: fix funny mix of const and non-const]
Signed-off-by: Jan Nikitenko <jan.nikitenko@gmail.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/kernel-api.tmpl |  4 ++-
 include/linux/crc7.h                  | 14 ++++++++
 lib/Kconfig                           |  8 +++++
 lib/Makefile                          |  1 +
 lib/crc7.c                            | 68 +++++++++++++++++++++++++++++++++++
 5 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/crc7.h
 create mode 100644 lib/crc7.c

(limited to 'include/linux')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 46bcff2849bd..fd2ef4d29b6d 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -139,8 +139,10 @@ X!Ilib/string.c
 !Elib/cmdline.c
      </sect1>
 
-     <sect1><title>CRC Functions</title>
+     <sect1 id="crc"><title>CRC Functions</title>
+!Elib/crc7.c
 !Elib/crc16.c
+!Elib/crc-itu-t.c
 !Elib/crc32.c
 !Elib/crc-ccitt.c
      </sect1>
diff --git a/include/linux/crc7.h b/include/linux/crc7.h
new file mode 100644
index 000000000000..1786e772d5c6
--- /dev/null
+++ b/include/linux/crc7.h
@@ -0,0 +1,14 @@
+#ifndef _LINUX_CRC7_H
+#define _LINUX_CRC7_H
+#include <linux/types.h>
+
+extern const u8 crc7_syndrome_table[256];
+
+static inline u8 crc7_byte(u8 crc, u8 data)
+{
+	return crc7_syndrome_table[(crc << 1) ^ data];
+}
+
+extern u8 crc7(u8 crc, const u8 *buffer, size_t len);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 3eb29d5dc4f5..e5c2c514174a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -41,6 +41,14 @@ config CRC32
 	  kernel tree does. Such modules that use library CRC32 functions
 	  require M here.
 
+config CRC7
+	tristate "CRC7 functions"
+	help
+	  This option is provided for the case where no in-kernel-tree
+	  modules require CRC7 functions, but a module built outside
+	  the kernel tree does. Such modules that use library CRC7
+	  functions require M here.
+
 config LIBCRC32C
 	tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index 8363b60be9dd..da68b2ca0606 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_CRC_CCITT)	+= crc-ccitt.o
 obj-$(CONFIG_CRC16)	+= crc16.o
 obj-$(CONFIG_CRC_ITU_T)	+= crc-itu-t.o
 obj-$(CONFIG_CRC32)	+= crc32.o
+obj-$(CONFIG_CRC7)	+= crc7.o
 obj-$(CONFIG_LIBCRC32C)	+= libcrc32c.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
 
diff --git a/lib/crc7.c b/lib/crc7.c
new file mode 100644
index 000000000000..f1c3a144cec1
--- /dev/null
+++ b/lib/crc7.c
@@ -0,0 +1,68 @@
+/*
+ *      crc7.c
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc7.h>
+
+
+/* Table for CRC-7 (polynomial x^7 + x^3 + 1) */
+const u8 crc7_syndrome_table[256] = {
+	0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+	0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
+	0x19, 0x10, 0x0b, 0x02, 0x3d, 0x34, 0x2f, 0x26,
+	0x51, 0x58, 0x43, 0x4a, 0x75, 0x7c, 0x67, 0x6e,
+	0x32, 0x3b, 0x20, 0x29, 0x16, 0x1f, 0x04, 0x0d,
+	0x7a, 0x73, 0x68, 0x61, 0x5e, 0x57, 0x4c, 0x45,
+	0x2b, 0x22, 0x39, 0x30, 0x0f, 0x06, 0x1d, 0x14,
+	0x63, 0x6a, 0x71, 0x78, 0x47, 0x4e, 0x55, 0x5c,
+	0x64, 0x6d, 0x76, 0x7f, 0x40, 0x49, 0x52, 0x5b,
+	0x2c, 0x25, 0x3e, 0x37, 0x08, 0x01, 0x1a, 0x13,
+	0x7d, 0x74, 0x6f, 0x66, 0x59, 0x50, 0x4b, 0x42,
+	0x35, 0x3c, 0x27, 0x2e, 0x11, 0x18, 0x03, 0x0a,
+	0x56, 0x5f, 0x44, 0x4d, 0x72, 0x7b, 0x60, 0x69,
+	0x1e, 0x17, 0x0c, 0x05, 0x3a, 0x33, 0x28, 0x21,
+	0x4f, 0x46, 0x5d, 0x54, 0x6b, 0x62, 0x79, 0x70,
+	0x07, 0x0e, 0x15, 0x1c, 0x23, 0x2a, 0x31, 0x38,
+	0x41, 0x48, 0x53, 0x5a, 0x65, 0x6c, 0x77, 0x7e,
+	0x09, 0x00, 0x1b, 0x12, 0x2d, 0x24, 0x3f, 0x36,
+	0x58, 0x51, 0x4a, 0x43, 0x7c, 0x75, 0x6e, 0x67,
+	0x10, 0x19, 0x02, 0x0b, 0x34, 0x3d, 0x26, 0x2f,
+	0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c,
+	0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04,
+	0x6a, 0x63, 0x78, 0x71, 0x4e, 0x47, 0x5c, 0x55,
+	0x22, 0x2b, 0x30, 0x39, 0x06, 0x0f, 0x14, 0x1d,
+	0x25, 0x2c, 0x37, 0x3e, 0x01, 0x08, 0x13, 0x1a,
+	0x6d, 0x64, 0x7f, 0x76, 0x49, 0x40, 0x5b, 0x52,
+	0x3c, 0x35, 0x2e, 0x27, 0x18, 0x11, 0x0a, 0x03,
+	0x74, 0x7d, 0x66, 0x6f, 0x50, 0x59, 0x42, 0x4b,
+	0x17, 0x1e, 0x05, 0x0c, 0x33, 0x3a, 0x21, 0x28,
+	0x5f, 0x56, 0x4d, 0x44, 0x7b, 0x72, 0x69, 0x60,
+	0x0e, 0x07, 0x1c, 0x15, 0x2a, 0x23, 0x38, 0x31,
+	0x46, 0x4f, 0x54, 0x5d, 0x62, 0x6b, 0x70, 0x79
+};
+EXPORT_SYMBOL(crc7_syndrome_table);
+
+/**
+ * crc7 - update the CRC7 for the data buffer
+ * @crc:     previous CRC7 value
+ * @buffer:  data pointer
+ * @len:     number of bytes in the buffer
+ * Context: any
+ *
+ * Returns the updated CRC7 value.
+ */
+u8 crc7(u8 crc, const u8 *buffer, size_t len)
+{
+	while (len--)
+		crc = crc7_byte(crc, *buffer++);
+	return crc;
+}
+EXPORT_SYMBOL(crc7);
+
+MODULE_DESCRIPTION("CRC7 calculations");
+MODULE_LICENSE("GPL");
-- 
cgit v1.3-8-gc7d7


From 447aef1a19135a69bfd725c33f7e753740cb8447 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben@trinity.fluff.org>
Date: Tue, 17 Jul 2007 04:04:10 -0700
Subject: SPI: tle620x power switch driver

Add support for the Infineon TLE62x0 series of low-side driver chips, such
as the TLE6220 or TLE6230.  These can be viewed as output GPIOs specialized
for power switching applications.  The driver provides a userspace
interface to those GPIOs, and to the switch status they provide.

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/Kconfig         |   9 ++
 drivers/spi/Makefile        |   1 +
 drivers/spi/tle62x0.c       | 328 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/tle62x0.h |  24 ++++
 4 files changed, 362 insertions(+)
 create mode 100644 drivers/spi/tle62x0.c
 create mode 100644 include/linux/spi/tle62x0.h

(limited to 'include/linux')

diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index 8ce16a0b8a50..a49270eae660 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -198,6 +198,15 @@ config SPI_SPIDEV
 	  Note that this application programming interface is EXPERIMENTAL
 	  and hence SUBJECT TO CHANGE WITHOUT NOTICE while it stabilizes.
 
+config SPI_TLE62X0
+	tristate "Infineon TLE62X0 (for power switching)"
+	depends on SPI_MASTER && SYSFS
+	help
+	  SPI driver for Infineon TLE62X0 series line driver chips,
+	  such as the TLE6220, TLE6230 and TLE6240.  This provides a
+	  sysfs interface, with each line presented as a kind of GPIO
+	  exposing both switch control and diagnostic feedback.
+
 #
 # Add new SPI protocol masters in alphabetical order above this line
 #
diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile
index 89293338f20c..d9bd7f605d6a 100644
--- a/drivers/spi/Makefile
+++ b/drivers/spi/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_SPI_S3C24XX)		+= spi_s3c24xx.o
 # SPI protocol drivers (device/link on bus)
 obj-$(CONFIG_SPI_AT25)		+= at25.o
 obj-$(CONFIG_SPI_SPIDEV)	+= spidev.o
+obj-$(CONFIG_SPI_TLE62X0)	+= tle62x0.o
 # 	... add above this line ...
 
 # SPI slave controller drivers (upstream link)
diff --git a/drivers/spi/tle62x0.c b/drivers/spi/tle62x0.c
new file mode 100644
index 000000000000..6da58ca48b33
--- /dev/null
+++ b/drivers/spi/tle62x0.c
@@ -0,0 +1,328 @@
+/*
+ * tle62x0.c -- support Infineon TLE62x0 driver chips
+ *
+ * Copyright (c) 2007 Simtec Electronics
+ *	Ben Dooks, <ben@simtec.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+
+#include <linux/spi/spi.h>
+#include <linux/spi/tle62x0.h>
+
+
+#define CMD_READ	0x00
+#define CMD_SET		0xff
+
+#define DIAG_NORMAL	0x03
+#define DIAG_OVERLOAD	0x02
+#define DIAG_OPEN	0x01
+#define DIAG_SHORTGND	0x00
+
+struct tle62x0_state {
+	struct spi_device	*us;
+	struct mutex		lock;
+	unsigned int		nr_gpio;
+	unsigned int		gpio_state;
+
+	unsigned char		tx_buff[4];
+	unsigned char		rx_buff[4];
+};
+
+static int to_gpio_num(struct device_attribute *attr);
+
+static inline int tle62x0_write(struct tle62x0_state *st)
+{
+	unsigned char *buff = st->tx_buff;
+	unsigned int gpio_state = st->gpio_state;
+
+	buff[0] = CMD_SET;
+
+	if (st->nr_gpio == 16) {
+		buff[1] = gpio_state >> 8;
+		buff[2] = gpio_state;
+	} else {
+		buff[1] = gpio_state;
+	}
+
+	dev_dbg(&st->us->dev, "buff %02x,%02x,%02x\n",
+		buff[0], buff[1], buff[2]);
+
+	return spi_write(st->us, buff, (st->nr_gpio == 16) ? 3 : 2);
+}
+
+static inline int tle62x0_read(struct tle62x0_state *st)
+{
+	unsigned char *txbuff = st->tx_buff;
+	struct spi_transfer xfer = {
+		.tx_buf		= txbuff,
+		.rx_buf		= st->rx_buff,
+		.len		= (st->nr_gpio * 2) / 8,
+	};
+	struct spi_message msg;
+
+	txbuff[0] = CMD_READ;
+	txbuff[1] = 0x00;
+	txbuff[2] = 0x00;
+	txbuff[3] = 0x00;
+
+	spi_message_init(&msg);
+	spi_message_add_tail(&xfer, &msg);
+
+	return spi_sync(st->us, &msg);
+}
+
+static unsigned char *decode_fault(unsigned int fault_code)
+{
+	fault_code &= 3;
+
+	switch (fault_code) {
+	case DIAG_NORMAL:
+		return "N";
+	case DIAG_OVERLOAD:
+		return "V";
+	case DIAG_OPEN:
+		return "O";
+	case DIAG_SHORTGND:
+		return "G";
+	}
+
+	return "?";
+}
+
+static ssize_t tle62x0_status_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct tle62x0_state *st = dev_get_drvdata(dev);
+	char *bp = buf;
+	unsigned char *buff = st->rx_buff;
+	unsigned long fault = 0;
+	int ptr;
+	int ret;
+
+	mutex_lock(&st->lock);
+	ret = tle62x0_read(st);
+
+	dev_dbg(dev, "tle62x0_read() returned %d\n", ret);
+
+	for (ptr = 0; ptr < (st->nr_gpio * 2)/8; ptr += 1) {
+		fault <<= 8;
+		fault  |= ((unsigned long)buff[ptr]);
+
+		dev_dbg(dev, "byte %d is %02x\n", ptr, buff[ptr]);
+	}
+
+	for (ptr = 0; ptr < st->nr_gpio; ptr++) {
+		bp += sprintf(bp, "%s ", decode_fault(fault >> (ptr * 2)));
+	}
+
+	*bp++ = '\n';
+
+	mutex_unlock(&st->lock);
+	return bp - buf;
+}
+
+static DEVICE_ATTR(status_show, S_IRUGO, tle62x0_status_show, NULL);
+
+static ssize_t tle62x0_gpio_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct tle62x0_state *st = dev_get_drvdata(dev);
+	int gpio_num = to_gpio_num(attr);
+	int value;
+
+	mutex_lock(&st->lock);
+	value = (st->gpio_state >> gpio_num) & 1;
+	mutex_unlock(&st->lock);
+
+	return snprintf(buf, PAGE_SIZE, "%d", value);
+}
+
+static ssize_t tle62x0_gpio_store(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf, size_t len)
+{
+	struct tle62x0_state *st = dev_get_drvdata(dev);
+	int gpio_num = to_gpio_num(attr);
+	unsigned long val;
+	char *endp;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (buf == endp)
+		return -EINVAL;
+
+	dev_dbg(dev, "setting gpio %d to %ld\n", gpio_num, val);
+
+	mutex_lock(&st->lock);
+
+	if (val)
+		st->gpio_state |= 1 << gpio_num;
+	else
+		st->gpio_state &= ~(1 << gpio_num);
+
+	tle62x0_write(st);
+	mutex_unlock(&st->lock);
+
+	return len;
+}
+
+static DEVICE_ATTR(gpio1, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio2, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio3, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio4, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio5, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio6, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio7, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio8, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio9, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio10, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio11, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio12, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio13, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio14, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio15, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+static DEVICE_ATTR(gpio16, S_IWUSR|S_IRUGO,
+		tle62x0_gpio_show, tle62x0_gpio_store);
+
+static struct device_attribute *gpio_attrs[] = {
+	[0]		= &dev_attr_gpio1,
+	[1]		= &dev_attr_gpio2,
+	[2]		= &dev_attr_gpio3,
+	[3]		= &dev_attr_gpio4,
+	[4]		= &dev_attr_gpio5,
+	[5]		= &dev_attr_gpio6,
+	[6]		= &dev_attr_gpio7,
+	[7]		= &dev_attr_gpio8,
+	[8]		= &dev_attr_gpio9,
+	[9]		= &dev_attr_gpio10,
+	[10]		= &dev_attr_gpio11,
+	[11]		= &dev_attr_gpio12,
+	[12]		= &dev_attr_gpio13,
+	[13]		= &dev_attr_gpio14,
+	[14]		= &dev_attr_gpio15,
+	[15]		= &dev_attr_gpio16
+};
+
+static int to_gpio_num(struct device_attribute *attr)
+{
+	int ptr;
+
+	for (ptr = 0; ptr < ARRAY_SIZE(gpio_attrs); ptr++) {
+		if (gpio_attrs[ptr] == attr)
+			return ptr;
+	}
+
+	return -1;
+}
+
+static int __devinit tle62x0_probe(struct spi_device *spi)
+{
+	struct tle62x0_state *st;
+	struct tle62x0_pdata *pdata;
+	int ptr;
+	int ret;
+
+	pdata = spi->dev.platform_data;
+	if (pdata == NULL) {
+		dev_err(&spi->dev, "no device data specified\n");
+		return -EINVAL;
+	}
+
+	st = kzalloc(sizeof(struct tle62x0_state), GFP_KERNEL);
+	if (st == NULL) {
+		dev_err(&spi->dev, "no memory for device state\n");
+		return -ENOMEM;
+	}
+
+	st->us = spi;
+	st->nr_gpio = pdata->gpio_count;
+	st->gpio_state = pdata->init_state;
+
+	mutex_init(&st->lock);
+
+	ret = device_create_file(&spi->dev, &dev_attr_status_show);
+	if (ret) {
+		dev_err(&spi->dev, "cannot create status attribute\n");
+		goto err_status;
+	}
+
+	for (ptr = 0; ptr < pdata->gpio_count; ptr++) {
+		ret = device_create_file(&spi->dev, gpio_attrs[ptr]);
+		if (ret) {
+			dev_err(&spi->dev, "cannot create gpio attribute\n");
+			goto err_gpios;
+		}
+	}
+
+	/* tle62x0_write(st); */
+	spi_set_drvdata(spi, st);
+	return 0;
+
+ err_gpios:
+	for (; ptr > 0; ptr--)
+		device_remove_file(&spi->dev, gpio_attrs[ptr]);
+
+	device_remove_file(&spi->dev, &dev_attr_status_show);
+
+ err_status:
+	kfree(st);
+	return ret;
+}
+
+static int __devexit tle62x0_remove(struct spi_device *spi)
+{
+	struct tle62x0_state *st = spi_get_drvdata(spi);
+	int ptr;
+
+	for (ptr = 0; ptr < st->nr_gpio; ptr++)
+		device_remove_file(&spi->dev, gpio_attrs[ptr]);
+
+	kfree(st);
+	return 0;
+}
+
+static struct spi_driver tle62x0_driver = {
+	.driver = {
+		.name	= "tle62x0",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= tle62x0_probe,
+	.remove		= __devexit_p(tle62x0_remove),
+};
+
+static __init int tle62x0_init(void)
+{
+	return spi_register_driver(&tle62x0_driver);
+}
+
+static __exit void tle62x0_exit(void)
+{
+	spi_unregister_driver(&tle62x0_driver);
+}
+
+module_init(tle62x0_init);
+module_exit(tle62x0_exit);
+
+MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
+MODULE_DESCRIPTION("TLE62x0 SPI driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/spi/tle62x0.h b/include/linux/spi/tle62x0.h
new file mode 100644
index 000000000000..60b59187e590
--- /dev/null
+++ b/include/linux/spi/tle62x0.h
@@ -0,0 +1,24 @@
+/*
+ * tle62x0.h - platform glue to Infineon TLE62x0 driver chips
+ *
+ * Copyright 2007 Simtec Electronics
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+struct tle62x0_pdata {
+	unsigned int		init_state;
+	unsigned int		gpio_count;
+};
-- 
cgit v1.3-8-gc7d7


From f29ba280ecb46331c1f6842b094808af01131422 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <joakim.tjernlund@transmode.se>
Date: Tue, 17 Jul 2007 04:04:12 -0700
Subject: spi_mpc83xx.c: support QE enabled 83xx CPU's like mpc832x

Quicc Engine enabled mpc83xx CPU's has a somewhat different HW interface to
the SPI controller.  This patch adds a qe_mode knob that sees to that
needed adaptions are performed.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/spi_mpc83xx.c   | 32 +++++++++++++++++++++++++++++---
 include/linux/fsl_devices.h |  2 +-
 2 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi_mpc83xx.c b/drivers/spi/spi_mpc83xx.c
index bbd51bc88a60..3295cfcc9f20 100644
--- a/drivers/spi/spi_mpc83xx.c
+++ b/drivers/spi/spi_mpc83xx.c
@@ -47,6 +47,7 @@ struct mpc83xx_spi_reg {
 #define	SPMODE_ENABLE		(1 << 24)
 #define	SPMODE_LEN(x)		((x) << 20)
 #define	SPMODE_PM(x)		((x) << 16)
+#define	SPMODE_OP		(1 << 14)
 
 /*
  * Default for SPI Mode:
@@ -85,6 +86,11 @@ struct mpc83xx_spi {
 	unsigned nsecs;		/* (clock cycle time)/2 */
 
 	u32 sysclk;
+	u32 rx_shift;		/* RX data reg shift when in qe mode */
+	u32 tx_shift;		/* TX data reg shift when in qe mode */
+
+	bool qe_mode;
+
 	void (*activate_cs) (u8 cs, u8 polarity);
 	void (*deactivate_cs) (u8 cs, u8 polarity);
 };
@@ -103,7 +109,7 @@ static inline u32 mpc83xx_spi_read_reg(__be32 __iomem * reg)
 void mpc83xx_spi_rx_buf_##type(u32 data, struct mpc83xx_spi *mpc83xx_spi) \
 {									  \
 	type * rx = mpc83xx_spi->rx;					  \
-	*rx++ = (type)data;						  \
+	*rx++ = (type)(data >> mpc83xx_spi->rx_shift);			  \
 	mpc83xx_spi->rx = rx;						  \
 }
 
@@ -114,7 +120,7 @@ u32 mpc83xx_spi_tx_buf_##type(struct mpc83xx_spi *mpc83xx_spi)	\
 	const type * tx = mpc83xx_spi->tx;			\
 	if (!tx)						\
 		return 0;					\
-	data = *tx++;						\
+	data = *tx++ << mpc83xx_spi->tx_shift;			\
 	mpc83xx_spi->tx = tx;					\
 	return data;						\
 }
@@ -203,12 +209,22 @@ int mpc83xx_spi_setup_transfer(struct spi_device *spi, struct spi_transfer *t)
 	    || ((bits_per_word > 16) && (bits_per_word != 32)))
 		return -EINVAL;
 
+	mpc83xx_spi->rx_shift = 0;
+	mpc83xx_spi->tx_shift = 0;
 	if (bits_per_word <= 8) {
 		mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u8;
 		mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u8;
+		if (mpc83xx_spi->qe_mode) {
+			mpc83xx_spi->rx_shift = 16;
+			mpc83xx_spi->tx_shift = 24;
+		}
 	} else if (bits_per_word <= 16) {
 		mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u16;
 		mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u16;
+		if (mpc83xx_spi->qe_mode) {
+			mpc83xx_spi->rx_shift = 16;
+			mpc83xx_spi->tx_shift = 16;
+		}
 	} else if (bits_per_word <= 32) {
 		mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u32;
 		mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u32;
@@ -386,7 +402,6 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev)
 		ret = -ENODEV;
 		goto free_master;
 	}
-
 	mpc83xx_spi = spi_master_get_devdata(master);
 	mpc83xx_spi->bitbang.master = spi_master_get(master);
 	mpc83xx_spi->bitbang.chipselect = mpc83xx_spi_chipselect;
@@ -395,9 +410,17 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev)
 	mpc83xx_spi->sysclk = pdata->sysclk;
 	mpc83xx_spi->activate_cs = pdata->activate_cs;
 	mpc83xx_spi->deactivate_cs = pdata->deactivate_cs;
+	mpc83xx_spi->qe_mode = pdata->qe_mode;
 	mpc83xx_spi->get_rx = mpc83xx_spi_rx_buf_u8;
 	mpc83xx_spi->get_tx = mpc83xx_spi_tx_buf_u8;
 
+	mpc83xx_spi->rx_shift = 0;
+	mpc83xx_spi->tx_shift = 0;
+	if (mpc83xx_spi->qe_mode) {
+		mpc83xx_spi->rx_shift = 16;
+		mpc83xx_spi->tx_shift = 24;
+	}
+
 	mpc83xx_spi->bitbang.master->setup = mpc83xx_spi_setup;
 	init_completion(&mpc83xx_spi->done);
 
@@ -432,6 +455,9 @@ static int __init mpc83xx_spi_probe(struct platform_device *dev)
 
 	/* Enable SPI interface */
 	regval = pdata->initial_spmode | SPMODE_INIT_VAL | SPMODE_ENABLE;
+	if (pdata->qe_mode)
+		regval |= SPMODE_OP;
+
 	mpc83xx_spi_write_reg(&mpc83xx_spi->base->mode, regval);
 
 	ret = spi_bitbang_start(&mpc83xx_spi->bitbang);
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 12e631f0fb77..695741b0e420 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -112,7 +112,7 @@ struct fsl_usb2_platform_data {
 struct fsl_spi_platform_data {
 	u32 	initial_spmode;	/* initial SPMODE value */
 	u16	bus_num;
-
+	bool	qe_mode;
 	/* board specific information */
 	u16	max_chipselect;
 	void	(*activate_cs)(u8 cs, u8 polarity);
-- 
cgit v1.3-8-gc7d7


From 67837f232d6d55be99d6e0dec4ea9bb8112840cd Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <matthias.kaehlcke@gmail.com>
Date: Tue, 17 Jul 2007 04:04:16 -0700
Subject: Use mutex instead of semaphore in CAPI 2.0 driver

The CAPI 2.0 driver uses a semaphore as mutex.  Use the mutex API instead of
the (binary) semaphore.

Signed-off-by: Matthias Kaehlcke <matthias.kaehlcke@gmail.com>
Acked-by: Karsten Keil <kkeil@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/isdn/capi/kcapi.c  | 6 +++---
 include/linux/kernelcapi.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index 3ed34f7a1c4f..9f73bc2727c2 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -258,7 +258,7 @@ static void recv_handler(struct work_struct *work)
 	if ((!ap) || (ap->release_in_progress))
 		return;
 
-	down(&ap->recv_sem);
+	mutex_lock(&ap->recv_mtx);
 	while ((skb = skb_dequeue(&ap->recv_queue))) {
 		if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_IND)
 			ap->nrecvdatapkt++;
@@ -267,7 +267,7 @@ static void recv_handler(struct work_struct *work)
 
 		ap->recv_message(ap, skb);
 	}
-	up(&ap->recv_sem);
+	mutex_unlock(&ap->recv_mtx);
 }
 
 void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb)
@@ -547,7 +547,7 @@ u16 capi20_register(struct capi20_appl *ap)
 	ap->nsentctlpkt = 0;
 	ap->nsentdatapkt = 0;
 	ap->callback = NULL;
-	init_MUTEX(&ap->recv_sem);
+	mutex_init(&ap->recv_mtx);
 	skb_queue_head_init(&ap->recv_queue);
 	INIT_WORK(&ap->recv_work, recv_handler);
 	ap->release_in_progress = 0;
diff --git a/include/linux/kernelcapi.h b/include/linux/kernelcapi.h
index aea34e74c496..8c4350a9ed87 100644
--- a/include/linux/kernelcapi.h
+++ b/include/linux/kernelcapi.h
@@ -64,7 +64,7 @@ struct capi20_appl {
 	unsigned long nrecvdatapkt;
 	unsigned long nsentctlpkt;
 	unsigned long nsentdatapkt;
-	struct semaphore recv_sem;
+	struct mutex recv_mtx;
 	struct sk_buff_head recv_queue;
 	struct work_struct recv_work;
 	int release_in_progress;
-- 
cgit v1.3-8-gc7d7


From a569425512253992cc64ebf8b6d00a62f986db3e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 17 Jul 2007 04:04:28 -0700
Subject: knfsd: exportfs: add exportfs.h header

currently the export_operation structure and helpers related to it are in
fs.h.  fs.h is already far too large and there are very few places needing the
export bits, so split them off into a separate header.

[akpm@linux-foundation.org: fix cifs build]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/export.c             |   1 +
 fs/efs/super.c               |   1 +
 fs/exportfs/expfs.c          |   1 +
 fs/ext2/super.c              |   1 +
 fs/ext3/super.c              |   1 +
 fs/ext4/super.c              |   1 +
 fs/fat/inode.c               |   1 +
 fs/gfs2/ops_export.c         |   1 +
 fs/isofs/isofs.h             |   1 +
 fs/jfs/super.c               |   1 +
 fs/nfsd/export.c             |   1 +
 fs/nfsd/nfsfh.c              |   1 +
 fs/ntfs/namei.c              |   1 +
 fs/ocfs2/export.h            |   2 +
 fs/reiserfs/inode.c          |   1 +
 fs/reiserfs/super.c          |   1 +
 fs/xfs/linux-2.6/xfs_super.h |   2 +
 include/linux/exportfs.h     | 119 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h           | 114 +----------------------------------------
 mm/shmem.c                   |   1 +
 20 files changed, 140 insertions(+), 113 deletions(-)
 create mode 100644 include/linux/exportfs.h

(limited to 'include/linux')

diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 1d716392c3aa..96df1d51fdc3 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -29,6 +29,7 @@
   */
 
 #include <linux/fs.h>
+#include <linux/exportfs.h>
  
 #ifdef CONFIG_CIFS_EXPERIMENTAL
  
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e0a6839e68ae..67338d622f56 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -11,6 +11,7 @@
 #include <linux/efs_fs.h>
 #include <linux/efs_vh.h>
 #include <linux/efs_fs_sb.h>
+#include <linux/exportfs.h>
 #include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e98f6cd7200c..16287af34859 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -1,4 +1,5 @@
 
+#include <linux/exportfs.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/module.h>
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b2efd9083b9b..3eefa97fe204 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -25,6 +25,7 @@
 #include <linux/parser.h>
 #include <linux/random.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/seq_file.h>
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 51d1c456cdab..4f84dc86628a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -29,6 +29,7 @@
 #include <linux/parser.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/mount.h>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d0d8c76c7edb..b806e689c4aa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -29,6 +29,7 @@
 #include <linux/parser.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/mount.h>
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index cfaf5877d98b..0a7ddb39a593 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -20,6 +20,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 99ea5659bc2c..b8312edee0e4 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -11,6 +11,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index efe2872cd4e3..a07e67b1ea7f 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -1,5 +1,6 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <linux/iso_fs.h>
 #include <asm/unaligned.h>
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 20e4ac1c79a3..49b652bd96ba 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/posix_acl.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 79bd03b8bbf8..c518421a9355 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -26,6 +26,7 @@
 #include <linux/mount.h>
 #include <linux/hash.h>
 #include <linux/module.h>
+#include <linux/exportfs.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 6ca2d24fc216..0108d3ec1c27 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -15,6 +15,7 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/dcache.h>
+#include <linux/exportfs.h>
 #include <linux/mount.h>
 
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index bff01a54675a..e93c6142b23c 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/dcache.h>
+#include <linux/exportfs.h>
 #include <linux/security.h>
 
 #include "attrib.h"
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 5b77ee7866ef..e08bed9e45a0 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -26,6 +26,8 @@
 #ifndef OCFS2_EXPORT_H
 #define OCFS2_EXPORT_H
 
+#include <linux/exportfs.h>
+
 extern struct export_operations ocfs2_export_ops;
 
 #endif /* OCFS2_EXPORT_H */
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 1272d11399fb..ddde489f1cb2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -7,6 +7,7 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
+#include <linux/exportfs.h>
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4ac9119200e..5a93cfe1a032 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/exportfs.h>
 #include <linux/vfs.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 33dd1ca13245..201cc3273c84 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -18,6 +18,8 @@
 #ifndef __XFS_SUPER_H__
 #define __XFS_SUPER_H__
 
+#include <linux/exportfs.h>
+
 #ifdef CONFIG_XFS_DMAPI
 # define vfs_insertdmapi(vfs)	vfs_insertops(vfsp, &xfs_dmops)
 # define vfs_initdmapi()	dmapi_init()
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
new file mode 100644
index 000000000000..fdc306fbba50
--- /dev/null
+++ b/include/linux/exportfs.h
@@ -0,0 +1,119 @@
+#ifndef LINUX_EXPORTFS_H
+#define LINUX_EXPORTFS_H 1
+
+#include <linux/types.h>
+
+struct dentry;
+struct super_block;
+
+
+/**
+ * struct export_operations - for nfsd to communicate with file systems
+ * @decode_fh:      decode a file handle fragment and return a &struct dentry
+ * @encode_fh:      encode a file handle fragment from a dentry
+ * @get_name:       find the name for a given inode in a given directory
+ * @get_parent:     find the parent of a given directory
+ * @get_dentry:     find a dentry for the inode given a file handle sub-fragment
+ * @find_exported_dentry:
+ *	set by the exporting module to a standard helper function.
+ *
+ * Description:
+ *    The export_operations structure provides a means for nfsd to communicate
+ *    with a particular exported file system  - particularly enabling nfsd and
+ *    the filesystem to co-operate when dealing with file handles.
+ *
+ *    export_operations contains two basic operation for dealing with file
+ *    handles, decode_fh() and encode_fh(), and allows for some other
+ *    operations to be defined which standard helper routines use to get
+ *    specific information from the filesystem.
+ *
+ *    nfsd encodes information use to determine which filesystem a filehandle
+ *    applies to in the initial part of the file handle.  The remainder, termed
+ *    a file handle fragment, is controlled completely by the filesystem.  The
+ *    standard helper routines assume that this fragment will contain one or
+ *    two sub-fragments, one which identifies the file, and one which may be
+ *    used to identify the (a) directory containing the file.
+ *
+ *    In some situations, nfsd needs to get a dentry which is connected into a
+ *    specific part of the file tree.  To allow for this, it passes the
+ *    function acceptable() together with a @context which can be used to see
+ *    if the dentry is acceptable.  As there can be multiple dentrys for a
+ *    given file, the filesystem should check each one for acceptability before
+ *    looking for the next.  As soon as an acceptable one is found, it should
+ *    be returned.
+ *
+ * decode_fh:
+ *    @decode_fh is given a &struct super_block (@sb), a file handle fragment
+ *    (@fh, @fh_len) and an acceptability testing function (@acceptable,
+ *    @context).  It should return a &struct dentry which refers to the same
+ *    file that the file handle fragment refers to,  and which passes the
+ *    acceptability test.  If it cannot, it should return a %NULL pointer if
+ *    the file was found but no acceptable &dentries were available, or a
+ *    %ERR_PTR error code indicating why it couldn't be found (e.g. %ENOENT or
+ *    %ENOMEM).
+ *
+ * encode_fh:
+ *    @encode_fh should store in the file handle fragment @fh (using at most
+ *    @max_len bytes) information that can be used by @decode_fh to recover the
+ *    file refered to by the &struct dentry @de.  If the @connectable flag is
+ *    set, the encode_fh() should store sufficient information so that a good
+ *    attempt can be made to find not only the file but also it's place in the
+ *    filesystem.   This typically means storing a reference to de->d_parent in
+ *    the filehandle fragment.  encode_fh() should return the number of bytes
+ *    stored or a negative error code such as %-ENOSPC
+ *
+ * get_name:
+ *    @get_name should find a name for the given @child in the given @parent
+ *    directory.  The name should be stored in the @name (with the
+ *    understanding that it is already pointing to a a %NAME_MAX+1 sized
+ *    buffer.   get_name() should return %0 on success, a negative error code
+ *    or error.  @get_name will be called without @parent->i_mutex held.
+ *
+ * get_parent:
+ *    @get_parent should find the parent directory for the given @child which
+ *    is also a directory.  In the event that it cannot be found, or storage
+ *    space cannot be allocated, a %ERR_PTR should be returned.
+ *
+ * get_dentry:
+ *    Given a &super_block (@sb) and a pointer to a file-system specific inode
+ *    identifier, possibly an inode number, (@inump) get_dentry() should find
+ *    the identified inode and return a dentry for that inode.  Any suitable
+ *    dentry can be returned including, if necessary, a new dentry created with
+ *    d_alloc_root.  The caller can then find any other extant dentrys by
+ *    following the d_alias links.  If a new dentry was created using
+ *    d_alloc_root, DCACHE_NFSD_DISCONNECTED should be set, and the dentry
+ *    should be d_rehash()ed.
+ *
+ *    If the inode cannot be found, either a %NULL pointer or an %ERR_PTR code
+ *    can be returned.  The @inump will be whatever was passed to
+ *    nfsd_find_fh_dentry() in either the @obj or @parent parameters.
+ *
+ * Locking rules:
+ *    get_parent is called with child->d_inode->i_mutex down
+ *    get_name is not (which is possibly inconsistent)
+ */
+
+struct export_operations {
+	struct dentry *(*decode_fh)(struct super_block *sb, __u32 *fh,
+			int fh_len, int fh_type,
+			int (*acceptable)(void *context, struct dentry *de),
+			void *context);
+	int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len,
+			int connectable);
+	int (*get_name)(struct dentry *parent, char *name,
+			struct dentry *child);
+	struct dentry * (*get_parent)(struct dentry *child);
+	struct dentry * (*get_dentry)(struct super_block *sb, void *inump);
+
+	/* This is set by the exporting module to a standard helper */
+	struct dentry * (*find_exported_dentry)(
+			struct super_block *sb, void *obj, void *parent,
+			int (*acceptable)(void *context, struct dentry *de),
+			void *context);
+};
+
+extern struct dentry *find_exported_dentry(struct super_block *sb, void *obj,
+	void *parent, int (*acceptable)(void *context, struct dentry *de),
+	void *context);
+
+#endif /* LINUX_EXPORTFS_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aa74f7de9dcd..58ce336d4a6b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -289,6 +289,7 @@ extern int dir_notify_enable;
 #include <asm/semaphore.h>
 #include <asm/byteorder.h>
 
+struct export_operations;
 struct hd_geometry;
 struct iovec;
 struct nameidata;
@@ -1278,119 +1279,6 @@ static inline void file_accessed(struct file *file)
 
 int sync_inode(struct inode *inode, struct writeback_control *wbc);
 
-/**
- * struct export_operations - for nfsd to communicate with file systems
- * @decode_fh:      decode a file handle fragment and return a &struct dentry
- * @encode_fh:      encode a file handle fragment from a dentry
- * @get_name:       find the name for a given inode in a given directory
- * @get_parent:     find the parent of a given directory
- * @get_dentry:     find a dentry for the inode given a file handle sub-fragment
- * @find_exported_dentry:
- *	set by the exporting module to a standard helper function.
- *
- * Description:
- *    The export_operations structure provides a means for nfsd to communicate
- *    with a particular exported file system  - particularly enabling nfsd and
- *    the filesystem to co-operate when dealing with file handles.
- *
- *    export_operations contains two basic operation for dealing with file
- *    handles, decode_fh() and encode_fh(), and allows for some other
- *    operations to be defined which standard helper routines use to get
- *    specific information from the filesystem.
- *
- *    nfsd encodes information use to determine which filesystem a filehandle
- *    applies to in the initial part of the file handle.  The remainder, termed
- *    a file handle fragment, is controlled completely by the filesystem.  The
- *    standard helper routines assume that this fragment will contain one or
- *    two sub-fragments, one which identifies the file, and one which may be
- *    used to identify the (a) directory containing the file.
- *
- *    In some situations, nfsd needs to get a dentry which is connected into a
- *    specific part of the file tree.  To allow for this, it passes the
- *    function acceptable() together with a @context which can be used to see
- *    if the dentry is acceptable.  As there can be multiple dentrys for a
- *    given file, the filesystem should check each one for acceptability before
- *    looking for the next.  As soon as an acceptable one is found, it should
- *    be returned.
- *
- * decode_fh:
- *    @decode_fh is given a &struct super_block (@sb), a file handle fragment
- *    (@fh, @fh_len) and an acceptability testing function (@acceptable,
- *    @context).  It should return a &struct dentry which refers to the same
- *    file that the file handle fragment refers to,  and which passes the
- *    acceptability test.  If it cannot, it should return a %NULL pointer if
- *    the file was found but no acceptable &dentries were available, or a
- *    %ERR_PTR error code indicating why it couldn't be found (e.g. %ENOENT or
- *    %ENOMEM).
- *
- * encode_fh:
- *    @encode_fh should store in the file handle fragment @fh (using at most
- *    @max_len bytes) information that can be used by @decode_fh to recover the
- *    file refered to by the &struct dentry @de.  If the @connectable flag is
- *    set, the encode_fh() should store sufficient information so that a good
- *    attempt can be made to find not only the file but also it's place in the
- *    filesystem.   This typically means storing a reference to de->d_parent in
- *    the filehandle fragment.  encode_fh() should return the number of bytes
- *    stored or a negative error code such as %-ENOSPC
- *
- * get_name:
- *    @get_name should find a name for the given @child in the given @parent
- *    directory.  The name should be stored in the @name (with the
- *    understanding that it is already pointing to a a %NAME_MAX+1 sized
- *    buffer.   get_name() should return %0 on success, a negative error code
- *    or error.  @get_name will be called without @parent->i_mutex held.
- *
- * get_parent:
- *    @get_parent should find the parent directory for the given @child which
- *    is also a directory.  In the event that it cannot be found, or storage
- *    space cannot be allocated, a %ERR_PTR should be returned.
- *
- * get_dentry:
- *    Given a &super_block (@sb) and a pointer to a file-system specific inode
- *    identifier, possibly an inode number, (@inump) get_dentry() should find
- *    the identified inode and return a dentry for that inode.  Any suitable
- *    dentry can be returned including, if necessary, a new dentry created with
- *    d_alloc_root.  The caller can then find any other extant dentrys by
- *    following the d_alias links.  If a new dentry was created using
- *    d_alloc_root, DCACHE_NFSD_DISCONNECTED should be set, and the dentry
- *    should be d_rehash()ed.
- *
- *    If the inode cannot be found, either a %NULL pointer or an %ERR_PTR code
- *    can be returned.  The @inump will be whatever was passed to
- *    nfsd_find_fh_dentry() in either the @obj or @parent parameters.
- *
- * Locking rules:
- *    get_parent is called with child->d_inode->i_mutex down
- *    get_name is not (which is possibly inconsistent)
- */
-
-struct export_operations {
-	struct dentry *(*decode_fh)(struct super_block *sb, __u32 *fh, int fh_len, int fh_type,
-			 int (*acceptable)(void *context, struct dentry *de),
-			 void *context);
-	int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len,
-			 int connectable);
-
-	/* the following are only called from the filesystem itself */
-	int (*get_name)(struct dentry *parent, char *name,
-			struct dentry *child);
-	struct dentry * (*get_parent)(struct dentry *child);
-	struct dentry * (*get_dentry)(struct super_block *sb, void *inump);
-
-	/* This is set by the exporting module to a standard helper */
-	struct dentry * (*find_exported_dentry)(
-		struct super_block *sb, void *obj, void *parent,
-		int (*acceptable)(void *context, struct dentry *de),
-		void *context);
-
-
-};
-
-extern struct dentry *
-find_exported_dentry(struct super_block *sb, void *obj, void *parent,
-		     int (*acceptable)(void *context, struct dentry *de),
-		     void *context);
-
 struct file_system_type {
 	const char *name;
 	int fs_flags;
diff --git a/mm/shmem.c b/mm/shmem.c
index e49181d9d893..96fa79fb6ad3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -27,6 +27,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/xattr.h>
+#include <linux/exportfs.h>
 #include <linux/generic_acl.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
-- 
cgit v1.3-8-gc7d7


From 5ca29607331fe37980dc3b488793ef8b1409b722 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 17 Jul 2007 04:04:29 -0700
Subject: knfsd: exportfs: remove iget abuse

When the exportfs interface was added the expectation was that filesystems
provide an operation to convert from a file handle to an inode/dentry, but it
kept a backwards compat option that still calls into iget.

Calling into iget from non-filesystem code is very bad, because it gives too
little information to filesystem, and simply crashes if the filesystem doesn't
implement the ->read_inode routine.

Fortunately there are only two filesystems left using this fallback: efs and
jfs.  This patch moves a copy of export_iget to each of those to implement the
get_dentry method.

While this is a temporary increase of lines of code in the kernel it allows
for a much cleaner interface and important code restructuring in later
patches.

[akpm@linux-foundation.org: add jfs_get_inode_flags() declaration]
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/efs/namei.c         | 32 +++++++++++++++++++++++++++++
 fs/efs/super.c         |  1 +
 fs/exportfs/expfs.c    | 56 +++-----------------------------------------------
 fs/jfs/jfs_inode.h     |  1 +
 fs/jfs/namei.c         | 32 +++++++++++++++++++++++++++++
 fs/jfs/super.c         |  1 +
 include/linux/efs_fs.h |  1 +
 7 files changed, 71 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index ed4a207fe22a..5276b19423c1 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -75,6 +75,38 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
 	return NULL;
 }
 
+struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp)
+{
+	__u32 *objp = vobjp;
+	unsigned long ino = objp[0];
+	__u32 generation = objp[1];
+	struct inode *inode;
+	struct dentry *result;
+
+	if (ino == 0)
+		return ERR_PTR(-ESTALE);
+	inode = iget(sb, ino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (is_bad_inode(inode) ||
+	    (generation && inode->i_generation != generation)) {
+	    	result = ERR_PTR(-ESTALE);
+		goto out_iput;
+	}
+
+	result = d_alloc_anon(inode);
+	if (!result) {
+		result = ERR_PTR(-ENOMEM);
+		goto out_iput;
+	}
+	return result;
+
+ out_iput:
+	iput(inode);
+	return result;
+}
+
 struct dentry *efs_get_parent(struct dentry *child)
 {
 	struct dentry *parent;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 67338d622f56..d360c81f3a72 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -114,6 +114,7 @@ static const struct super_operations efs_superblock_operations = {
 };
 
 static struct export_operations efs_export_ops = {
+	.get_dentry	= efs_get_dentry,
 	.get_parent	= efs_get_parent,
 };
 
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 16287af34859..dd132bb7b8f8 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -391,61 +391,11 @@ out:
 	return error;
 }
 
-
-static struct dentry *export_iget(struct super_block *sb, unsigned long ino, __u32 generation)
+static struct dentry *get_dentry(struct super_block *sb, void *vobjp)
 {
-
-	/* iget isn't really right if the inode is currently unallocated!!
-	 * This should really all be done inside each filesystem
-	 *
-	 * ext2fs' read_inode has been strengthed to return a bad_inode if
-	 * the inode had been deleted.
-	 *
-	 * Currently we don't know the generation for parent directory, so
-	 * a generation of 0 means "accept any"
-	 */
-	struct inode *inode;
-	struct dentry *result;
-	if (ino == 0)
-		return ERR_PTR(-ESTALE);
-	inode = iget(sb, ino);
-	if (inode == NULL)
-		return ERR_PTR(-ENOMEM);
-	if (is_bad_inode(inode)
-	    || (generation && inode->i_generation != generation)
-		) {
-		/* we didn't find the right inode.. */
-		dprintk("fh_verify: Inode %lu, Bad count: %d %d or version  %u %u\n",
-			inode->i_ino,
-			inode->i_nlink, atomic_read(&inode->i_count),
-			inode->i_generation,
-			generation);
-
-		iput(inode);
-		return ERR_PTR(-ESTALE);
-	}
-	/* now to find a dentry.
-	 * If possible, get a well-connected one
-	 */
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return ERR_PTR(-ESTALE);
 }
 
-
-static struct dentry *get_object(struct super_block *sb, void *vobjp)
-{
-	__u32 *objp = vobjp;
-	unsigned long ino = objp[0];
-	__u32 generation = objp[1];
-
-	return export_iget(sb, ino, generation);
-}
-
-
 /**
  * export_encode_fh - default export_operations->encode_fh function
  * @dentry:  the dentry to encode
@@ -524,7 +474,7 @@ struct export_operations export_op_default = {
 
 	.get_name	= get_name,
 	.get_parent	= get_parent,
-	.get_dentry	= get_object,
+	.get_dentry	= get_dentry,
 };
 
 EXPORT_SYMBOL(export_op_default);
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 2374b595f2e1..f0ec72b263f1 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t);
 extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
 extern void jfs_get_inode_flags(struct jfs_inode_info *);
+extern struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp);
 extern void jfs_set_inode_flags(struct inode *);
 extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 25161c4121e4..932797ba433b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1477,6 +1477,38 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
 	return dentry;
 }
 
+struct dentry *jfs_get_dentry(struct super_block *sb, void *vobjp)
+{
+	__u32 *objp = vobjp;
+	unsigned long ino = objp[0];
+	__u32 generation = objp[1];
+	struct inode *inode;
+	struct dentry *result;
+
+	if (ino == 0)
+		return ERR_PTR(-ESTALE);
+	inode = iget(sb, ino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (is_bad_inode(inode) ||
+	    (generation && inode->i_generation != generation)) {
+	    	result = ERR_PTR(-ESTALE);
+		goto out_iput;
+	}
+
+	result = d_alloc_anon(inode);
+	if (!result) {
+		result = ERR_PTR(-ENOMEM);
+		goto out_iput;
+	}
+	return result;
+
+ out_iput:
+	iput(inode);
+	return result;
+}
+
 struct dentry *jfs_get_parent(struct dentry *dentry)
 {
 	struct super_block *sb = dentry->d_inode->i_sb;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 49b652bd96ba..929fceca7999 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -738,6 +738,7 @@ static const struct super_operations jfs_super_operations = {
 };
 
 static struct export_operations jfs_export_operations = {
+	.get_dentry	= jfs_get_dentry,
 	.get_parent	= jfs_get_parent,
 };
 
diff --git a/include/linux/efs_fs.h b/include/linux/efs_fs.h
index dfed8009ebff..16cb25cbf7c5 100644
--- a/include/linux/efs_fs.h
+++ b/include/linux/efs_fs.h
@@ -45,6 +45,7 @@ extern efs_block_t efs_map_block(struct inode *, efs_block_t);
 extern int efs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *);
+extern struct dentry *efs_get_dentry(struct super_block *sb, void *vobjp);
 extern struct dentry *efs_get_parent(struct dentry *);
 extern int efs_bmap(struct inode *, int);
 
-- 
cgit v1.3-8-gc7d7


From d37065cd6d6bbe98fd4be14d6c9e64c0bfa124c5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 17 Jul 2007 04:04:30 -0700
Subject: knfsd: exportfs: add procedural interface for NFSD

Currently NFSD calls directly into filesystems through the export_operations
structure.  I plan to change this interface in various ways in later patches,
and want to avoid the export of the default operations to NFSD, so this patch
adds two simple exportfs_encode_fh/exportfs_decode_fh helpers for NFSD to call
instead of poking into exportfs guts.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exportfs/expfs.c      | 22 +++++++++++++++++++++-
 fs/nfsd/nfsfh.c          | 18 +++++-------------
 include/linux/exportfs.h |  7 +++++++
 3 files changed, 33 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index dd132bb7b8f8..db86006956b0 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 
 struct export_operations export_op_default;
@@ -468,6 +469,26 @@ static struct dentry *export_decode_fh(struct super_block *sb, __u32 *fh, int fh
 				   acceptable, context);
 }
 
+int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
+		int connectable)
+{
+	struct export_operations *nop = dentry->d_sb->s_export_op;
+
+	return CALL(nop, encode_fh)(dentry, fh, max_len, connectable);
+}
+EXPORT_SYMBOL_GPL(exportfs_encode_fh);
+
+struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh, int fh_len,
+		int fileid_type, int (*acceptable)(void *, struct dentry *),
+		void *context)
+{
+	struct export_operations *nop = mnt->mnt_sb->s_export_op;
+
+	return CALL(nop, decode_fh)(mnt->mnt_sb, fh, fh_len, fileid_type,
+			acceptable, context);
+}
+EXPORT_SYMBOL_GPL(exportfs_decode_fh);
+
 struct export_operations export_op_default = {
 	.decode_fh	= export_decode_fh,
 	.encode_fh	= export_encode_fh,
@@ -477,7 +498,6 @@ struct export_operations export_op_default = {
 	.get_dentry	= get_dentry,
 };
 
-EXPORT_SYMBOL(export_op_default);
 EXPORT_SYMBOL(find_exported_dentry);
 
 MODULE_LICENSE("GPL");
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 0108d3ec1c27..ecf9c361b3f5 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -28,10 +28,6 @@
 static int nfsd_nr_verified;
 static int nfsd_nr_put;
 
-extern struct export_operations export_op_default;
-
-#define	CALL(ops,fun) ((ops->fun)?(ops->fun):export_op_default.fun)
-
 /*
  * our acceptability function.
  * if NOSUBTREECHECK, accept anything
@@ -212,11 +208,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 		if (fileid_type == 0)
 			dentry = dget(exp->ex_dentry);
 		else {
-			struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op;
-			dentry = CALL(nop,decode_fh)(exp->ex_mnt->mnt_sb,
-						     datap, data_left,
-						     fileid_type,
-						     nfsd_acceptable, exp);
+			dentry = exportfs_decode_fh(exp->ex_mnt, datap,
+					data_left, fileid_type,
+					nfsd_acceptable, exp);
 		}
 		if (dentry == NULL)
 			goto out;
@@ -287,15 +281,13 @@ out:
 static inline int _fh_update(struct dentry *dentry, struct svc_export *exp,
 			     __u32 *datap, int *maxsize)
 {
-	struct export_operations *nop = exp->ex_mnt->mnt_sb->s_export_op;
-
 	if (dentry == exp->ex_dentry) {
 		*maxsize = 0;
 		return 0;
 	}
 
-	return CALL(nop,encode_fh)(dentry, datap, maxsize,
-			  !(exp->ex_flags&NFSEXP_NOSUBTREECHECK));
+	return exportfs_encode_fh(dentry, datap, maxsize,
+			  !(exp->ex_flags & NFSEXP_NOSUBTREECHECK));
 }
 
 /*
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index fdc306fbba50..8872fe8392d6 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -5,6 +5,7 @@
 
 struct dentry;
 struct super_block;
+struct vfsmount;
 
 
 /**
@@ -116,4 +117,10 @@ extern struct dentry *find_exported_dentry(struct super_block *sb, void *obj,
 	void *parent, int (*acceptable)(void *context, struct dentry *de),
 	void *context);
 
+extern int exportfs_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len,
+	int connectable);
+extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, __u32 *fh,
+	int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *),
+	void *context);
+
 #endif /* LINUX_EXPORTFS_H */
-- 
cgit v1.3-8-gc7d7


From 9a8db97e7756119689c93c431e8b8324080f5625 Mon Sep 17 00:00:00 2001
From: Marc Eshel <eshel@almaden.ibm.com>
Date: Tue, 17 Jul 2007 04:04:35 -0700
Subject: knfsd: lockd: nfsd4: use same grace period for lockd and nfsd4

Both lockd and (in the nfsv4 case) nfsd enforce a "grace period" after reboot,
during which clients may reclaim locks from the previous server instance, but
may not acquire new locks.

Currently the lockd and nfsd enforce grace periods of different lengths.  This
may cause problems when we reboot a server with both v2/v3 and v4 clients.
For example, if the lockd grace period is shorter (as is likely the case),
then a v3 client might acquire a new lock that conflicts with a lock already
held (but not yet reclaimed) by a v4 client.

This patch calculates a lease time that lockd and nfsd can both use.

Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/lockd/svc.c             | 27 ++++++++++++++++++++-------
 fs/nfsd/lockd.c            |  1 +
 fs/nfsd/nfs4state.c        | 16 ++++++++++++----
 include/linux/lockd/bind.h |  9 +++++++++
 4 files changed, 42 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 9fcdef50aad6..82e2192a0d5c 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -76,18 +76,31 @@ static const int		nlm_port_min = 0, nlm_port_max = 65535;
 
 static struct ctl_table_header * nlm_sysctl_table;
 
-static unsigned long set_grace_period(void)
+static unsigned long get_lockd_grace_period(void)
 {
-	unsigned long grace_period;
-
 	/* Note: nlm_timeout should always be nonzero */
 	if (nlm_grace_period)
-		grace_period = ((nlm_grace_period + nlm_timeout - 1)
-				/ nlm_timeout) * nlm_timeout * HZ;
+		return roundup(nlm_grace_period, nlm_timeout) * HZ;
 	else
-		grace_period = nlm_timeout * 5 * HZ;
+		return nlm_timeout * 5 * HZ;
+}
+
+unsigned long get_nfs_grace_period(void)
+{
+	unsigned long lockdgrace = get_lockd_grace_period();
+	unsigned long nfsdgrace = 0;
+
+	if (nlmsvc_ops)
+		nfsdgrace = nlmsvc_ops->get_grace_period();
+
+	return max(lockdgrace, nfsdgrace);
+}
+EXPORT_SYMBOL(get_nfs_grace_period);
+
+static unsigned long set_grace_period(void)
+{
 	nlmsvc_grace_period = 1;
-	return grace_period + jiffies;
+	return get_nfs_grace_period() + jiffies;
 }
 
 static inline void clear_grace_period(void)
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 221acd1f11f6..9e4a568a5013 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -65,6 +65,7 @@ nlm_fclose(struct file *filp)
 static struct nlmsvc_binding	nfsd_nlm_ops = {
 	.fopen		= nlm_fopen,		/* open file for locking */
 	.fclose		= nlm_fclose,		/* close file */
+	.get_grace_period = get_nfs4_grace_period,
 };
 
 void
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8c52913d7cb6..9cc31eaf3857 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -51,6 +51,7 @@
 #include <linux/namei.h>
 #include <linux/mutex.h>
 #include <linux/lockd/bind.h>
+#include <linux/module.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -3190,20 +3191,27 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
+unsigned long
+get_nfs4_grace_period(void)
+{
+	return max(user_lease_time, lease_time) * HZ;
+}
+
 /* initialization to perform when the nfsd service is started: */
 
 static void
 __nfs4_state_start(void)
 {
-	time_t grace_time;
+	unsigned long grace_time;
 
 	boot_time = get_seconds();
-	grace_time = max(user_lease_time, lease_time);
+	grace_time = get_nfs_grace_period();
 	lease_time = user_lease_time;
 	in_grace = 1;
-	printk("NFSD: starting %ld-second grace period\n", grace_time);
+	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
+	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time*HZ);
+	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
 }
 
 int
diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h
index 246de1d84a26..6f1637c61e10 100644
--- a/include/linux/lockd/bind.h
+++ b/include/linux/lockd/bind.h
@@ -27,6 +27,7 @@ struct nlmsvc_binding {
 						struct nfs_fh *,
 						struct file **);
 	void			(*fclose)(struct file *);
+	unsigned long		(*get_grace_period)(void);
 };
 
 extern struct nlmsvc_binding *	nlmsvc_ops;
@@ -38,4 +39,12 @@ extern int	nlmclnt_proc(struct inode *, int, struct file_lock *);
 extern int	lockd_up(int proto);
 extern void	lockd_down(void);
 
+unsigned long get_nfs_grace_period(void);
+
+#ifdef CONFIG_NFSD_V4
+unsigned long get_nfs4_grace_period(void);
+#else
+static inline unsigned long get_nfs4_grace_period(void) {return 0;}
+#endif
+
 #endif /* LINUX_LOCKD_BIND_H */
-- 
cgit v1.3-8-gc7d7


From 33a1060ae7dc671a0208b341bd454009625bb5a6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 17 Jul 2007 04:04:35 -0700
Subject: knfsd: nfsd4: fix NFSv4 filehandle size units confusion

NFS4_FHSIZE is measured in bytes, not 4-byte words, so much more space than
necessary is being allocated for struct nfs4_cb_recall.

I should have wondered why this structure was so much larger than it needed to
be!

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nfsd/state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index ab5c236bd9a7..732de9cad4a8 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -67,7 +67,7 @@ struct nfs4_cb_recall {
 	int			cbr_trunc;
 	stateid_t		cbr_stateid;
 	u32			cbr_fhlen;
-	u32			cbr_fhval[NFS4_FHSIZE];
+	char			cbr_fhval[NFS4_FHSIZE];
 	struct nfs4_delegation	*cbr_dp;
 };
 
-- 
cgit v1.3-8-gc7d7


From 1e5140279f31e47d58ed6036ee61ba7a65710e63 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 17 Jul 2007 04:04:38 -0700
Subject: knfsd: nfsd: remove unused header interface.h

It looks like Al Viro gutted this header file five years ago and it hasn't
been touched since.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfsctl.c               |  1 -
 include/linux/nfsd/interface.h | 13 -------------
 include/linux/nfsd/nfsd.h      |  1 -
 3 files changed, 15 deletions(-)
 delete mode 100644 include/linux/nfsd/interface.h

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5ab80edc795b..baac89d917ca 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -35,7 +35,6 @@
 #include <linux/nfsd/cache.h>
 #include <linux/nfsd/xdr.h>
 #include <linux/nfsd/syscall.h>
-#include <linux/nfsd/interface.h>
 
 #include <asm/uaccess.h>
 
diff --git a/include/linux/nfsd/interface.h b/include/linux/nfsd/interface.h
deleted file mode 100644
index af0979704afb..000000000000
--- a/include/linux/nfsd/interface.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * include/linux/nfsd/interface.h
- *
- * defines interface between nfsd and other bits of
- * the kernel.  Particularly filesystems (eventually).
- *
- * Copyright (C) 2000 Neil Brown <neilb@cse.unsw.edu.au>
- */
-
-#ifndef LINUX_NFSD_INTERFACE_H
-#define LINUX_NFSD_INTERFACE_H
-
-#endif /* LINUX_NFSD_INTERFACE_H */
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 72feac581aa3..0d8420497765 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -22,7 +22,6 @@
 #include <linux/nfsd/export.h>
 #include <linux/nfsd/auth.h>
 #include <linux/nfsd/stats.h>
-#include <linux/nfsd/interface.h>
 /*
  * nfsd version
  */
-- 
cgit v1.3-8-gc7d7


From c2f1a551dea8b37c2e0cb886885c250fb703e9d8 Mon Sep 17 00:00:00 2001
From: Meelap Shah <meelap@umich.edu>
Date: Tue, 17 Jul 2007 04:04:39 -0700
Subject: knfsd: nfsd4: vary maximum delegation limit based on RAM size

Our original NFSv4 delegation policy was to give out a read delegation on any
open when it was possible to.

Since the lifetime of a delegation isn't limited to that of an open, a client
may quite reasonably hang on to a delegation as long as it has the inode
cached.  This becomes an obvious problem the first time a client's inode cache
approaches the size of the server's total memory.

Our first quick solution was to add a hard-coded limit.  This patch makes a
mild incremental improvement by varying that limit according to the server's
total memory size, allowing at most 4 delegations per megabyte of RAM.

My quick back-of-the-envelope calculation finds that in the worst case (where
every delegation is for a different inode), a delegation could take about
1.5K, which would make the worst case usage about 6% of memory.  The new limit
works out to be about the same as the old on a 1-gig server.

[akpm@linux-foundation.org: Don't needlessly bloat vmlinux]
[akpm@linux-foundation.org: Make it right for highmem machines]
Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs4state.c       | 26 +++++++++++++++++++++++++-
 include/linux/nfsd/nfsd.h |  1 +
 mm/page_alloc.c           |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9cc31eaf3857..46249886ea86 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,6 +49,7 @@
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
 #include <linux/namei.h>
+#include <linux/swap.h>
 #include <linux/mutex.h>
 #include <linux/lockd/bind.h>
 #include <linux/module.h>
@@ -150,6 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
 }
 
 static int num_delegations;
+unsigned int max_delegations;
 
 /*
  * Open owner state (share locks)
@@ -193,7 +195,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
 
 	dprintk("NFSD alloc_init_deleg\n");
-	if (num_delegations > STATEID_HASH_SIZE * 4)
+	if (num_delegations > max_delegations)
 		return NULL;
 	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
 	if (dp == NULL)
@@ -3197,6 +3199,27 @@ get_nfs4_grace_period(void)
 	return max(user_lease_time, lease_time) * HZ;
 }
 
+/*
+ * Since the lifetime of a delegation isn't limited to that of an open, a
+ * client may quite reasonably hang on to a delegation as long as it has
+ * the inode cached.  This becomes an obvious problem the first time a
+ * client's inode cache approaches the size of the server's total memory.
+ *
+ * For now we avoid this problem by imposing a hard limit on the number
+ * of delegations, which varies according to the server's memory size.
+ */
+static void
+set_max_delegations(void)
+{
+	/*
+	 * Allow at most 4 delegations per megabyte of RAM.  Quick
+	 * estimates suggest that in the worst case (where every delegation
+	 * is for a different inode), a delegation could take about 1.5K,
+	 * giving a worst case usage of about 6% of memory.
+	 */
+	max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
+}
+
 /* initialization to perform when the nfsd service is started: */
 
 static void
@@ -3212,6 +3235,7 @@ __nfs4_state_start(void)
 	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	set_max_delegations();
 }
 
 int
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 0d8420497765..ce5e345a9bce 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -148,6 +148,7 @@ extern int nfsd_max_blksize;
  * NFSv4 State
  */
 #ifdef CONFIG_NFSD_V4
+extern unsigned int max_delegations;
 void nfs4_state_init(void);
 int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1a889c3fec59..e2a10b957f23 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1484,6 +1484,7 @@ unsigned int nr_free_buffer_pages(void)
 {
 	return nr_free_zone_pages(gfp_zone(GFP_USER));
 }
+EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
 
 /*
  * Amount of free RAM allocatable within all zones
-- 
cgit v1.3-8-gc7d7


From 47f9940c55c0bdc65188749cae4e841601f513bb Mon Sep 17 00:00:00 2001
From: Meelap Shah <meelap@umich.edu>
Date: Tue, 17 Jul 2007 04:04:40 -0700
Subject: knfsd: nfsd4: don't delegate files that have had conflicts

One more incremental delegation policy improvement: don't give out a
delegation on a file if conflicting access has previously required that a
delegation be revoked on that file.  (In practice we'll forget about the
conflict when the struct nfs4_file is removed on close, so this is of limited
use for now, though it should at least solve a temporary problem with
self-conflicts on write opens from the same client.)

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs4state.c        | 4 ++++
 include/linux/nfsd/state.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 46249886ea86..e4a4c87ec8c6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -195,6 +195,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
 
 	dprintk("NFSD alloc_init_deleg\n");
+	if (fp->fi_had_conflict)
+		return NULL;
 	if (num_delegations > max_delegations)
 		return NULL;
 	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
@@ -1002,6 +1004,7 @@ alloc_init_file(struct inode *ino)
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
+		fp->fi_had_conflict = false;
 		return fp;
 	}
 	return NULL;
@@ -1328,6 +1331,7 @@ do_recall(void *__dp)
 {
 	struct nfs4_delegation *dp = __dp;
 
+	dp->dl_file->fi_had_conflict = true;
 	nfsd4_cb_recall(dp);
 	return 0;
 }
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 732de9cad4a8..db348f749376 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -224,6 +224,7 @@ struct nfs4_file {
 	struct inode		*fi_inode;
 	u32                     fi_id;      /* used with stateowner->so_id 
 					     * for stateid_hashtbl hash */
+	bool			fi_had_conflict;
 };
 
 /*
-- 
cgit v1.3-8-gc7d7


From c4170583f655dca5da32bd14173d6a93805fc48b Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:42 -0700
Subject: knfsd: nfsd4: store pseudoflavor in request

Add a new field to the svc_rqst structure to record the pseudoflavor that the
request was made with.  For now we record the pseudoflavor but don't use it
for anything.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sunrpc/gss_api.h        |  1 +
 include/linux/sunrpc/svc.h            |  1 +
 net/sunrpc/auth_gss/gss_mech_switch.c | 14 ++++++++++++++
 net/sunrpc/auth_gss/svcauth_gss.c     |  2 ++
 net/sunrpc/svcauth_unix.c             |  3 +++
 5 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h
index bbac101ac372..459c5fc11d51 100644
--- a/include/linux/sunrpc/gss_api.h
+++ b/include/linux/sunrpc/gss_api.h
@@ -58,6 +58,7 @@ u32 gss_unwrap(
 u32 gss_delete_sec_context(
 		struct gss_ctx		**ctx_id);
 
+u32 gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 service);
 u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor);
 char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service);
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 129d50f2225c..705a90aa345e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -212,6 +212,7 @@ struct svc_rqst {
 	struct svc_pool *	rq_pool;	/* thread pool */
 	struct svc_procedure *	rq_procinfo;	/* procedure info */
 	struct auth_ops *	rq_authop;	/* authentication flavour */
+	u32			rq_flavor;	/* pseudoflavor */
 	struct svc_cred		rq_cred;	/* auth info */
 	struct sk_buff *	rq_skbuff;	/* fast recv inet buffer */
 	struct svc_deferred_req*rq_deferred;	/* deferred request we are replaying */
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 26872517ccf3..61801a069ff0 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -193,6 +193,20 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
 
 EXPORT_SYMBOL(gss_mech_get_by_pseudoflavor);
 
+u32
+gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service)
+{
+	int i;
+
+	for (i = 0; i < gm->gm_pf_num; i++) {
+		if (gm->gm_pfs[i].service == service) {
+			return gm->gm_pfs[i].pseudoflavor;
+		}
+	}
+	return RPC_AUTH_MAXFLAVOR; /* illegal value */
+}
+EXPORT_SYMBOL(gss_svc_to_pseudoflavor);
+
 u32
 gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
 {
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index c094583386fd..7a3e1758bea1 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1131,6 +1131,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
 		}
 		svcdata->rsci = rsci;
 		cache_get(&rsci->h);
+		rqstp->rq_flavor = gss_svc_to_pseudoflavor(
+					rsci->mechctx->mech_type, gc->gc_svc);
 		ret = SVC_OK;
 		goto out;
 	}
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 07dcd20cbee4..d9fdf2e4d242 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -5,6 +5,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/gss_api.h>
 #include <linux/err.h>
 #include <linux/seq_file.h>
 #include <linux/hash.h>
@@ -707,6 +708,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
 	svc_putnl(resv, RPC_AUTH_NULL);
 	svc_putnl(resv, 0);
 
+	rqstp->rq_flavor = RPC_AUTH_NULL;
 	return SVC_OK;
 }
 
@@ -784,6 +786,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
 	svc_putnl(resv, RPC_AUTH_NULL);
 	svc_putnl(resv, 0);
 
+	rqstp->rq_flavor = RPC_AUTH_UNIX;
 	return SVC_OK;
 
 badcred:
-- 
cgit v1.3-8-gc7d7


From e677bfe4d451f8271986a229270c6eecd1f62b3f Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:42 -0700
Subject: knfsd: nfsd4: parse secinfo information in exports downcall

We add a list of pseudoflavors to each export downcall, which will be used
both as a list of security flavors allowed on that export, and (in the order
given) as the list of pseudoflavors to return on secinfo calls.

This patch parses the new downcall information and adds it to the export
structure, but doesn't use it for anything yet.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/export.c            | 56 +++++++++++++++++++++++++++++++++++++++++++--
 include/linux/nfsd/export.h | 17 ++++++++++++++
 2 files changed, 71 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index d4accdcb53a2..fbbbcc5a2fa3 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -33,6 +33,8 @@
 #include <linux/nfsd/nfsfh.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/bind.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_api.h>
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -452,8 +454,48 @@ out_free_all:
 	return err;
 }
 
+static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+	int listsize, err;
+	struct exp_flavor_info *f;
+
+	err = get_int(mesg, &listsize);
+	if (err)
+		return err;
+	if (listsize < 0 || listsize > MAX_SECINFO_LIST)
+		return -EINVAL;
+
+	for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
+		err = get_int(mesg, &f->pseudoflavor);
+		if (err)
+			return err;
+		/*
+		 * Just a quick sanity check; we could also try to check
+		 * whether this pseudoflavor is supported, but at worst
+		 * an unsupported pseudoflavor on the export would just
+		 * be a pseudoflavor that won't match the flavor of any
+		 * authenticated request.  The administrator will
+		 * probably discover the problem when someone fails to
+		 * authenticate.
+		 */
+		if (f->pseudoflavor < 0)
+			return -EINVAL;
+		err = get_int(mesg, &f->flags);
+		if (err)
+			return err;
+		/* Only some flags are allowed to differ between flavors: */
+		if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags))
+			return -EINVAL;
+	}
+	exp->ex_nflavors = listsize;
+	return 0;
+}
+
 #else /* CONFIG_NFSD_V4 */
-static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; }
+static inline int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;}
+static inline int
+secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
 #endif
 
 static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
@@ -477,6 +519,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_uuid = NULL;
 
+	/* secinfo */
+	exp.ex_nflavors = 0;
+
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
 	mesg[mlen-1] = 0;
@@ -554,7 +599,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 					if (exp.ex_uuid == NULL)
 						err = -ENOMEM;
 				}
-			} else
+			} else if (strcmp(buf, "secinfo") == 0)
+				err = secinfo_parse(&mesg, buf, &exp);
+			else
 				/* quietly ignore unknown words and anything
 				 * following. Newer user-space can try to set
 				 * new values, then see what the result was.
@@ -655,6 +702,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 {
 	struct svc_export *new = container_of(cnew, struct svc_export, h);
 	struct svc_export *item = container_of(citem, struct svc_export, h);
+	int i;
 
 	new->ex_flags = item->ex_flags;
 	new->ex_anon_uid = item->ex_anon_uid;
@@ -670,6 +718,10 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	item->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
 	item->ex_fslocs.migrated = 0;
+	new->ex_nflavors = item->ex_nflavors;
+	for (i = 0; i < MAX_SECINFO_LIST; i++) {
+		new->ex_flavors[i] = item->ex_flavors[i];
+	}
 }
 
 static struct cache_head *svc_export_alloc(void)
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 9f62d6182d32..736f0eafcedf 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -42,6 +42,8 @@
 #define	NFSEXP_NOACL		0x8000	/* reserved for possible ACL related use */
 #define NFSEXP_ALLFLAGS		0xFE3F
 
+/* The flags that may vary depending on security flavor: */
+#define NFSEXP_SECINFO_FLAGS	0
 
 #ifdef __KERNEL__
 
@@ -64,6 +66,19 @@ struct nfsd4_fs_locations {
 	int migrated;
 };
 
+/*
+ * We keep an array of pseudoflavors with the export, in order from most
+ * to least preferred.  For the forseeable future, we don't expect more
+ * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3,
+ * spkm3i, and spkm3p (and using all 8 at once should be rare).
+ */
+#define MAX_SECINFO_LIST	8
+
+struct exp_flavor_info {
+	u32	pseudoflavor;
+	u32	flags;
+};
+
 struct svc_export {
 	struct cache_head	h;
 	struct auth_domain *	ex_client;
@@ -76,6 +91,8 @@ struct svc_export {
 	int			ex_fsid;
 	unsigned char *		ex_uuid; /* 16 byte fsid */
 	struct nfsd4_fs_locations ex_fslocs;
+	int			ex_nflavors;
+	struct exp_flavor_info	ex_flavors[MAX_SECINFO_LIST];
 };
 
 /* an "export key" (expkey) maps a filehandlefragement to an
-- 
cgit v1.3-8-gc7d7


From df547efb03e3e8f9ea726e1d07fbbd6fd0706cd7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:43 -0700
Subject: knfsd: nfsd4: simplify exp_pseudoroot arguments

We're passing three arguments to exp_pseudoroot, two of which are just fields
of the svc_rqst.  Soon we'll want to pass in a third field as well.  So let's
just give up and pass in the whole struct svc_rqst.

Also sneak in some minor style cleanups while we're at it.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/export.c            | 5 ++---
 fs/nfsd/nfs4proc.c          | 7 +++----
 fs/nfsd/nfs4xdr.c           | 2 +-
 include/linux/nfsd/export.h | 2 +-
 4 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index fbbbcc5a2fa3..af6abb2529c9 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1235,8 +1235,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
  * export point with fsid==0
  */
 __be32
-exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
-	       struct cache_req *creq)
+exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 {
 	struct svc_export *exp;
 	__be32 rv;
@@ -1244,7 +1243,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 
 	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
-	exp = exp_find(clp, FSID_NUM, fsidv, creq);
+	exp = exp_find(rqstp->rq_client, FSID_NUM, fsidv, rqstp->rq_chandle);
 	if (PTR_ERR(exp) == -ENOENT)
 		return nfserr_perm;
 	if (IS_ERR(exp))
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8522729830db..a106e3be7c13 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -286,8 +286,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 
 	fh_put(&cstate->current_fh);
-	status = exp_pseudoroot(rqstp->rq_client, &cstate->current_fh,
-			      &rqstp->rq_chandle);
+	status = exp_pseudoroot(rqstp, &cstate->current_fh);
 	return status;
 }
 
@@ -474,8 +473,8 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 ret;
 
 	fh_init(&tmp_fh, NFS4_FHSIZE);
-	if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh,
-			      &rqstp->rq_chandle)) != 0)
+	ret = exp_pseudoroot(rqstp, &tmp_fh);
+	if (ret)
 		return ret;
 	if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) {
 		fh_put(&tmp_fh);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15809dfd88a5..b0bfbda375e1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1296,7 +1296,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
 	char *path, *rootpath;
 
 	fh_init(&tmp_fh, NFS4_FHSIZE);
-	*stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle);
+	*stat = exp_pseudoroot(rqstp, &tmp_fh);
 	if (*stat)
 		return NULL;
 	rootpath = tmp_fh.fh_export->ex_path;
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 736f0eafcedf..5ed4f277eeac 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -135,7 +135,7 @@ struct svc_export *	exp_parent(struct auth_domain *clp,
 				   struct cache_req *reqp);
 int			exp_rootfh(struct auth_domain *, 
 					char *path, struct knfsd_fh *, int maxsize);
-__be32			exp_pseudoroot(struct auth_domain *, struct svc_fh *fhp, struct cache_req *creq);
+__be32			exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
 __be32			nfserrno(int errno);
 
 extern struct cache_detail svc_export_cache;
-- 
cgit v1.3-8-gc7d7


From 0989a7889695831e49e2c53c1884f52645516a90 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:44 -0700
Subject: knfsd: nfsd: provide export lookup wrappers which take a svc_rqst

Split the callers of exp_get_by_name(), exp_find(), and exp_parent() into
those that are processing requests and those that are doing other stuff (like
looking up filehandles for mountd).

No change in behavior, just a (fairly pointless, on its own) cleanup.

(Note this has the effect of making nfsd_cross_mnt() pass rqstp->rq_client
instead of exp->ex_client into exp_find_by_name().  However, the two should
have the same value at this point.)

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/export.c            | 28 +++++++++++++++++++++++++++-
 fs/nfsd/nfsfh.c             |  5 ++---
 fs/nfsd/vfs.c               |  5 ++---
 include/linux/nfsd/export.h |  7 +++++++
 4 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index af6abb2529c9..9b569af695ab 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1228,6 +1228,32 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 	return exp;
 }
 
+/*
+ * Called from functions that handle requests; functions that do work on
+ * behalf of mountd are passed a single client name to use, and should
+ * use exp_get_by_name() or exp_find().
+ */
+struct svc_export *
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
+		struct dentry *dentry)
+{
+	return exp_get_by_name(rqstp->rq_client, mnt, dentry,
+						&rqstp->rq_chandle);
+}
+
+struct svc_export *
+rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
+{
+	return exp_find(rqstp->rq_client, fsid_type, fsidv,
+						&rqstp->rq_chandle);
+}
+
+struct svc_export *
+rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
+		struct dentry *dentry)
+{
+	return exp_parent(rqstp->rq_client, mnt, dentry, &rqstp->rq_chandle);
+}
 
 /*
  * Called when we need the filehandle for the root of the pseudofs,
@@ -1243,7 +1269,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 
 	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
-	exp = exp_find(rqstp->rq_client, FSID_NUM, fsidv, rqstp->rq_chandle);
+	exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
 	if (PTR_ERR(exp) == -ENOENT)
 		return nfserr_perm;
 	if (IS_ERR(exp))
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 89f9041a7782..180e068ea064 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -145,7 +145,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 				fh->fh_fsid[1] = fh->fh_fsid[2];
 			}
 			if ((data_left -= len)<0) goto out;
-			exp = exp_find(rqstp->rq_client, fh->fh_fsid_type, datap, &rqstp->rq_chandle);
+			exp = rqst_exp_find(rqstp, fh->fh_fsid_type, datap);
 			datap += len;
 		} else {
 			dev_t xdev;
@@ -156,8 +156,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 			xdev = old_decode_dev(fh->ofh_xdev);
 			xino = u32_to_ino_t(fh->ofh_xino);
 			mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
-			exp = exp_find(rqstp->rq_client, FSID_DEV, tfh,
-				       &rqstp->rq_chandle);
+			exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
 		}
 
 		error = nfserr_stale;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ec6aaf8b0e36..65043af232ee 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -113,7 +113,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 
 	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
 
-	exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle);
+	exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
 	if (IS_ERR(exp2)) {
 		err = PTR_ERR(exp2);
 		dput(mounts);
@@ -188,8 +188,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 			dput(dentry);
 			dentry = dp;
 
-			exp2 = exp_parent(exp->ex_client, mnt, dentry,
-					  &rqstp->rq_chandle);
+			exp2 = rqst_exp_parent(rqstp, mnt, dentry);
 			if (PTR_ERR(exp2) == -ENOENT) {
 				dput(dentry);
 				dentry = dget(dparent);
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 5ed4f277eeac..1ba53e524749 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -129,10 +129,16 @@ struct svc_export *	exp_get_by_name(struct auth_domain *clp,
 					struct vfsmount *mnt,
 					struct dentry *dentry,
 					struct cache_req *reqp);
+struct svc_export *	rqst_exp_get_by_name(struct svc_rqst *,
+					     struct vfsmount *,
+					     struct dentry *);
 struct svc_export *	exp_parent(struct auth_domain *clp,
 				   struct vfsmount *mnt,
 				   struct dentry *dentry,
 				   struct cache_req *reqp);
+struct svc_export *	rqst_exp_parent(struct svc_rqst *,
+					struct vfsmount *mnt,
+					struct dentry *dentry);
 int			exp_rootfh(struct auth_domain *, 
 					char *path, struct knfsd_fh *, int maxsize);
 __be32			exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
@@ -152,6 +158,7 @@ static inline void exp_get(struct svc_export *exp)
 extern struct svc_export *
 exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 	 struct cache_req *reqp);
+struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.3-8-gc7d7


From 3ab4d8b1215d61736e2a9a26bea7cc2e6b029e3d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:46 -0700
Subject: knfsd: nfsd: set rq_client to ip-address-determined-domain

We want it to be possible for users to restrict exports both by IP address and
by pseudoflavor.  The pseudoflavor information has previously been passed
using special auth_domains stored in the rq_client field.  After the preceding
patch that stored the pseudoflavor in rq_pflavor, that's now superfluous; so
now we use rq_client for the ip information, as auth_null and auth_unix do.

However, we keep around the special auth_domain in the rq_gssclient field for
backwards compatibility purposes, so we can still do upcalls using the old
"gss/pseudoflavor" auth_domain if upcalls using the unix domain to give us an
appropriate export.  This allows us to continue supporting old mountd.

In fact, for this first patch, we always use the "gss/pseudoflavor"
auth_domain (and only it) if it is available; thus rq_client is ignored in the
auth_gss case, and this patch on its own makes no change in behavior; that
will be left to later patches.

Note on idmap: I'm almost tempted to just replace the auth_domain in the idmap
upcall by a dummy value--no version of idmapd has ever used it, and it's
unlikely anyone really wants to perform idmapping differently depending on the
where the client is (they may want to perform *credential* mapping
differently, but that's a different matter--the idmapper just handles id's
used in getattr and setattr).  But I'm updating the idmapd code anyway, just
out of general backwards-compatibility paranoia.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/export.c                  | 15 +++++++++++----
 fs/nfsd/nfs4idmap.c               | 13 +++++++++++--
 fs/nfsd/nfsfh.c                   |  2 --
 include/linux/sunrpc/svc.h        |  1 +
 include/linux/sunrpc/svcauth.h    |  1 +
 net/sunrpc/auth_gss/svcauth_gss.c | 21 ++++++++++++++++++---
 net/sunrpc/svcauth_unix.c         |  4 +++-
 7 files changed, 45 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 9b569af695ab..ac225a79376c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1237,21 +1237,28 @@ struct svc_export *
 rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		struct dentry *dentry)
 {
-	return exp_get_by_name(rqstp->rq_client, mnt, dentry,
-						&rqstp->rq_chandle);
+	struct auth_domain *clp;
+
+	clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
+	return exp_get_by_name(clp, mnt, dentry, &rqstp->rq_chandle);
 }
 
 struct svc_export *
 rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
 {
-	return exp_find(rqstp->rq_client, fsid_type, fsidv,
-						&rqstp->rq_chandle);
+	struct auth_domain *clp;
+
+	clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
+	return exp_find(clp, fsid_type, fsidv, &rqstp->rq_chandle);
 }
 
 struct svc_export *
 rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		struct dentry *dentry)
 {
+	struct auth_domain *clp;
+
+	clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
 	return exp_parent(rqstp->rq_client, mnt, dentry, &rqstp->rq_chandle);
 }
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 45aa21ce6784..2cf9a9a2d89c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -587,6 +587,15 @@ idmap_lookup(struct svc_rqst *rqstp,
 	return ret;
 }
 
+static char *
+rqst_authname(struct svc_rqst *rqstp)
+{
+	struct auth_domain *clp;
+
+	clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client;
+	return clp->name;
+}
+
 static int
 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
 		uid_t *id)
@@ -600,7 +609,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 		return -EINVAL;
 	memcpy(key.name, name, namelen);
 	key.name[namelen] = '\0';
-	strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname));
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
 	if (ret == -ENOENT)
 		ret = -ESRCH; /* nfserr_badname */
@@ -620,7 +629,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 	};
 	int ret;
 
-	strlcpy(key.authname, rqstp->rq_client->name, sizeof(key.authname));
+	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, idtoname_lookup, &key, &idtoname_cache, &item);
 	if (ret == -ENOENT)
 		return sprintf(name, "%u", id);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 180e068ea064..22cb5be79ad0 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -120,8 +120,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 		int data_left = fh->fh_size/4;
 
 		error = nfserr_stale;
-		if (rqstp->rq_client == NULL)
-			goto out;
 		if (rqstp->rq_vers > 2)
 			error = nfserr_badhandle;
 		if (rqstp->rq_vers == 4 && fh->fh_size == 0)
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 705a90aa345e..8531a70da73d 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -249,6 +249,7 @@ struct svc_rqst {
 						 */
 	/* Catering to nfsd */
 	struct auth_domain *	rq_client;	/* RPC peer info */
+	struct auth_domain *	rq_gssclient;	/* "gss/"-style peer info */
 	struct svc_cacherep *	rq_cacherep;	/* cache info */
 	struct knfsd_fh *	rq_reffh;	/* Referrence filehandle, used to
 						 * determine what device number
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index de92619b0826..22e1ef8e200e 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -127,6 +127,7 @@ extern struct auth_domain *auth_unix_lookup(struct in_addr addr);
 extern int auth_unix_forget_old(struct auth_domain *dom);
 extern void svcauth_unix_purge(void);
 extern void svcauth_unix_info_release(void *);
+extern int svcauth_unix_set_client(struct svc_rqst *rqstp);
 
 static inline unsigned long hash_str(char *name, int bits)
 {
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 7a3e1758bea1..e4b3de08b040 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -913,10 +913,23 @@ svcauth_gss_set_client(struct svc_rqst *rqstp)
 	struct gss_svc_data *svcdata = rqstp->rq_auth_data;
 	struct rsc *rsci = svcdata->rsci;
 	struct rpc_gss_wire_cred *gc = &svcdata->clcred;
+	int stat;
 
-	rqstp->rq_client = find_gss_auth_domain(rsci->mechctx, gc->gc_svc);
-	if (rqstp->rq_client == NULL)
+	/*
+	 * A gss export can be specified either by:
+	 * 	export	*(sec=krb5,rw)
+	 * or by
+	 * 	export gss/krb5(rw)
+	 * The latter is deprecated; but for backwards compatibility reasons
+	 * the nfsd code will still fall back on trying it if the former
+	 * doesn't work; so we try to make both available to nfsd, below.
+	 */
+	rqstp->rq_gssclient = find_gss_auth_domain(rsci->mechctx, gc->gc_svc);
+	if (rqstp->rq_gssclient == NULL)
 		return SVC_DENIED;
+	stat = svcauth_unix_set_client(rqstp);
+	if (stat == SVC_DROP)
+		return stat;
 	return SVC_OK;
 }
 
@@ -1088,7 +1101,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
 			svc_putnl(resv, GSS_SEQ_WIN);
 			if (svc_safe_putnetobj(resv, &rsip->out_token))
 				goto drop;
-			rqstp->rq_client = NULL;
 		}
 		goto complete;
 	case RPC_GSS_PROC_DESTROY:
@@ -1319,6 +1331,9 @@ out_err:
 	if (rqstp->rq_client)
 		auth_domain_put(rqstp->rq_client);
 	rqstp->rq_client = NULL;
+	if (rqstp->rq_gssclient)
+		auth_domain_put(rqstp->rq_gssclient);
+	rqstp->rq_gssclient = NULL;
 	if (rqstp->rq_cred.cr_group_info)
 		put_group_info(rqstp->rq_cred.cr_group_info);
 	rqstp->rq_cred.cr_group_info = NULL;
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index d9fdf2e4d242..411479411b21 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -638,7 +638,7 @@ static int unix_gid_find(uid_t uid, struct group_info **gip,
 	}
 }
 
-static int
+int
 svcauth_unix_set_client(struct svc_rqst *rqstp)
 {
 	struct sockaddr_in *sin = svc_addr_in(rqstp);
@@ -673,6 +673,8 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 	return SVC_OK;
 }
 
+EXPORT_SYMBOL(svcauth_unix_set_client);
+
 static int
 svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
 {
-- 
cgit v1.3-8-gc7d7


From 32c1eb0cd7ee00b5eb7b6f7059c635fbc1052966 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:48 -0700
Subject: knfsd: nfsd4: return nfserr_wrongsec

Make the first actual use of the secinfo information by using it to return
nfserr_wrongsec when an export is found that doesn't allow the flavor used on
this request.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/export.c            | 26 ++++++++++++++++++++++++++
 fs/nfsd/nfsfh.c             |  6 ++++++
 fs/nfsd/nfssvc.c            | 10 ++++++++++
 fs/nfsd/vfs.c               |  4 ++++
 include/linux/nfsd/export.h |  1 +
 include/linux/nfsd/nfsd.h   |  1 +
 6 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 4537a8f5cb9a..323cbdcc9bfd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1228,6 +1228,28 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 	return exp;
 }
 
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	/* legacy gss-only clients are always OK: */
+	if (exp->ex_client == rqstp->rq_gssclient)
+		return 0;
+	/* ip-address based client; check sec= export option: */
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return 0;
+	}
+	/* defaults in absence of sec= options: */
+	if (exp->ex_nflavors == 0) {
+		if (rqstp->rq_flavor == RPC_AUTH_NULL ||
+		    rqstp->rq_flavor == RPC_AUTH_UNIX)
+			return 0;
+	}
+	return nfserr_wrongsec;
+}
+
 /*
  * Uses rq_client and rq_gssclient to find an export; uses rq_client (an
  * auth_unix client) if it's available and has secinfo information;
@@ -1340,6 +1362,10 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
 	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
+	if (rv)
+		goto out;
+	rv = check_nfsd_access(exp, rqstp);
+out:
 	exp_put(exp);
 	return rv;
 }
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 22cb5be79ad0..d5fe392b14fb 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -20,6 +20,7 @@
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcauth_gss.h>
 #include <linux/nfsd/nfsd.h>
 
 #define NFSDDBG_FACILITY		NFSDDBG_FH
@@ -248,6 +249,11 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 	if (error)
 		goto out;
 
+	/* Check security flavor */
+	error = check_nfsd_access(exp, rqstp);
+	if (error)
+		goto out;
+
 	/* Finally, check access permissions. */
 	error = nfsd_permission(exp, dentry, access);
 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 5c8192bcbced..a8c89ae4c743 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -494,6 +494,15 @@ out:
 	module_put_and_exit(0);
 }
 
+static __be32 map_new_errors(u32 vers, __be32 nfserr)
+{
+	if (nfserr == nfserr_jukebox && vers == 2)
+		return nfserr_dropit;
+	if (nfserr == nfserr_wrongsec && vers < 4)
+		return nfserr_acces;
+	return nfserr;
+}
+
 int
 nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 {
@@ -536,6 +545,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
+	nfserr = map_new_errors(rqstp->rq_vers, nfserr);
 	if (nfserr == nfserr_jukebox && rqstp->rq_vers == 2)
 		nfserr = nfserr_dropit;
 	if (nfserr == nfserr_dropit) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 627f460a4007..8e109e586a74 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -240,6 +240,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
 	if (err)
 		return err;
+	err = check_nfsd_access(exp, rqstp);
+	if (err)
+		goto out;
 	/*
 	 * Note: we compose the file handle now, but as the
 	 * dentry may be negative, it may need to be updated.
@@ -247,6 +250,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	err = fh_compose(resfh, exp, dentry, fhp);
 	if (!err && !dentry->d_inode)
 		err = nfserr_noent;
+out:
 	dput(dentry);
 	exp_put(exp);
 	return err;
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 1ba53e524749..424be41130ba 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -116,6 +116,7 @@ struct svc_expkey {
 #define EX_NOHIDE(exp)		((exp)->ex_flags & NFSEXP_NOHIDE)
 #define EX_WGATHER(exp)		((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
 
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
 
 /*
  * Function declarations
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index ce5e345a9bce..62499c2f0918 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -236,6 +236,7 @@ void		nfsd_lockd_shutdown(void);
 #define	nfserr_badname		__constant_htonl(NFSERR_BADNAME)
 #define	nfserr_cb_path_down	__constant_htonl(NFSERR_CB_PATH_DOWN)
 #define	nfserr_locked		__constant_htonl(NFSERR_LOCKED)
+#define	nfserr_wrongsec		__constant_htonl(NFSERR_WRONGSEC)
 #define	nfserr_replay_me	__constant_htonl(NFSERR_REPLAY_ME)
 
 /* error codes for internal use */
-- 
cgit v1.3-8-gc7d7


From 0ec757df9743025f14190d6034d8bd2bf37c2dd1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:48 -0700
Subject: knfsd: nfsd4: make readonly access depend on pseudoflavor

Allow readonly access to vary depending on the pseudoflavor, using the flag
passed with each pseudoflavor in the export downcall.  The rest of the flags
are ignored for now, though some day we might also allow id squashing to vary
based on the flavor.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfsfh.c             |  2 +-
 fs/nfsd/nfsproc.c           |  3 ++-
 fs/nfsd/vfs.c               | 13 +++++++------
 include/linux/nfsd/export.h | 13 ++++++++++++-
 include/linux/nfsd/nfsd.h   |  3 ++-
 5 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index d5fe392b14fb..8d2b49914843 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -255,7 +255,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 		goto out;
 
 	/* Finally, check access permissions. */
-	error = nfsd_permission(exp, dentry, access);
+	error = nfsd_permission(rqstp, exp, dentry, access);
 
 	if (error) {
 		dprintk("fh_verify: %s/%s permission failure, "
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index b2c7147aa921..977a71f64e19 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -278,7 +278,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					 *   echo thing > device-special-file-or-pipe
 					 * by doing a CREATE with type==0
 					 */
-					nfserr = nfsd_permission(newfhp->fh_export,
+					nfserr = nfsd_permission(rqstp,
+								 newfhp->fh_export,
 								 newfhp->fh_dentry,
 								 MAY_WRITE|MAY_LOCAL_ACCESS);
 					if (nfserr && nfserr != nfserr_rofs)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8e109e586a74..e90f4a8a1d01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -328,7 +328,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	/* The size case is special. It changes the file as well as the attributes.  */
 	if (iap->ia_valid & ATTR_SIZE) {
 		if (iap->ia_size < inode->i_size) {
-			err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
+			err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
 			if (err)
 				goto out;
 		}
@@ -616,7 +616,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 
 			sresult |= map->access;
 
-			err2 = nfsd_permission(export, dentry, map->how);
+			err2 = nfsd_permission(rqstp, export, dentry, map->how);
 			switch (err2) {
 			case nfs_ok:
 				result |= map->access;
@@ -1043,7 +1043,7 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	__be32		err;
 
 	if (file) {
-		err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
+		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
 				MAY_READ|MAY_OWNER_OVERRIDE);
 		if (err)
 			goto out;
@@ -1072,7 +1072,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	__be32			err = 0;
 
 	if (file) {
-		err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
+		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
 				MAY_WRITE|MAY_OWNER_OVERRIDE);
 		if (err)
 			goto out;
@@ -1801,7 +1801,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
  * Check for a user's access permissions to this inode.
  */
 __be32
-nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
+nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
+					struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
 	int		err;
@@ -1832,7 +1833,7 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
 	 */
 	if (!(acc & MAY_LOCAL_ACCESS))
 		if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
-			if (EX_RDONLY(exp) || IS_RDONLY(inode))
+			if (EX_RDONLY(exp, rqstp) || IS_RDONLY(inode))
 				return nfserr_rofs;
 			if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
 				return nfserr_perm;
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 424be41130ba..a01f775cb944 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -112,10 +112,21 @@ struct svc_expkey {
 
 #define EX_SECURE(exp)		(!((exp)->ex_flags & NFSEXP_INSECURE_PORT))
 #define EX_ISSYNC(exp)		(!((exp)->ex_flags & NFSEXP_ASYNC))
-#define EX_RDONLY(exp)		((exp)->ex_flags & NFSEXP_READONLY)
 #define EX_NOHIDE(exp)		((exp)->ex_flags & NFSEXP_NOHIDE)
 #define EX_WGATHER(exp)		((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
 
+static inline int EX_RDONLY(struct svc_export *exp, struct svc_rqst *rqstp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return f->flags & NFSEXP_READONLY;
+	}
+	return exp->ex_flags & NFSEXP_READONLY;
+}
+
 __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
 
 /*
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 62499c2f0918..54ef1a18a56c 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -119,7 +119,8 @@ __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 				struct kstatfs *);
 
 int		nfsd_notify_change(struct inode *, struct iattr *);
-__be32		nfsd_permission(struct svc_export *, struct dentry *, int);
+__be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
+				struct dentry *, int);
 int		nfsd_sync_dir(struct dentry *dp);
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-- 
cgit v1.3-8-gc7d7


From dcb488a3b7ac3987e21148f44f641c9b2e734232 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:51 -0700
Subject: knfsd: nfsd4: implement secinfo

Implement the secinfo operation.

(Thanks to Usha Ketineni wrote an earlier version of this support.)

Cc: Usha Ketineni <uketinen@us.ibm.com>
Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs4proc.c        | 28 ++++++++++++++++++
 fs/nfsd/nfs4xdr.c         | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfsd/nfsd.h |  3 ++
 include/linux/nfsd/xdr4.h |  7 +++++
 4 files changed, 113 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a106e3be7c13..3c627128e205 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -47,6 +47,7 @@
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
 #include <linux/nfs4_acl.h>
+#include <linux/sunrpc/gss_api.h>
 
 #define NFSDDBG_FACILITY		NFSDDBG_PROC
 
@@ -609,6 +610,30 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status;
 }
 
+static __be32
+nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	      struct nfsd4_secinfo *secinfo)
+{
+	struct svc_fh resfh;
+	struct svc_export *exp;
+	struct dentry *dentry;
+	__be32 err;
+
+	fh_init(&resfh, NFS4_FHSIZE);
+	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
+				    secinfo->si_name, secinfo->si_namelen,
+				    &exp, &dentry);
+	if (err)
+		return err;
+	if (dentry->d_inode == NULL) {
+		exp_put(exp);
+		err = nfserr_noent;
+	} else
+		secinfo->si_exp = exp;
+	dput(dentry);
+	return err;
+}
+
 static __be32
 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	      struct nfsd4_setattr *setattr)
@@ -1008,6 +1033,9 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 	[OP_SAVEFH] = {
 		.op_func = (nfsd4op_func)nfsd4_savefh,
 	},
+	[OP_SECINFO] = {
+		.op_func = (nfsd4op_func)nfsd4_secinfo,
+	},
 	[OP_SETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_setattr,
 	},
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b0bfbda375e1..864498f8f2d9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -56,6 +56,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
+#include <linux/sunrpc/gss_api.h>
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
@@ -818,6 +819,23 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
 	DECODE_TAIL;
 }
 
+static __be32
+nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
+		     struct nfsd4_secinfo *secinfo)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(secinfo->si_namelen);
+	READ_BUF(secinfo->si_namelen);
+	SAVEMEM(secinfo->si_name, secinfo->si_namelen);
+	status = check_filename(secinfo->si_name, secinfo->si_namelen,
+								nfserr_noent);
+	if (status)
+		return status;
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
 {
@@ -1131,6 +1149,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		case OP_SAVEFH:
 			op->status = nfs_ok;
 			break;
+		case OP_SECINFO:
+			op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo);
+			break;
 		case OP_SETATTR:
 			op->status = nfsd4_decode_setattr(argp, &op->u.setattr);
 			break;
@@ -1847,11 +1868,19 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
 	if (d_mountpoint(dentry)) {
 		int err;
 
+		/*
+		 * Why the heck aren't we just using nfsd_lookup??
+		 * Different "."/".." handling?  Something else?
+		 * At least, add a comment here to explain....
+		 */
 		err = nfsd_cross_mnt(cd->rd_rqstp, &dentry, &exp);
 		if (err) {
 			nfserr = nfserrno(err);
 			goto out_put;
 		}
+		nfserr = check_nfsd_access(exp, cd->rd_rqstp);
+		if (nfserr)
+			goto out_put;
 
 	}
 	nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
@@ -2419,6 +2448,49 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 	}
 }
 
+static void
+nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, int nfserr,
+		     struct nfsd4_secinfo *secinfo)
+{
+	int i = 0;
+	struct svc_export *exp = secinfo->si_exp;
+	ENCODE_HEAD;
+
+	if (nfserr)
+		goto out;
+	RESERVE_SPACE(4);
+	WRITE32(exp->ex_nflavors);
+	ADJUST_ARGS();
+	for (i = 0; i < exp->ex_nflavors; i++) {
+		u32 flav = exp->ex_flavors[i].pseudoflavor;
+		struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav);
+
+		if (gm) {
+			RESERVE_SPACE(4);
+			WRITE32(RPC_AUTH_GSS);
+			ADJUST_ARGS();
+			RESERVE_SPACE(4 + gm->gm_oid.len);
+			WRITE32(gm->gm_oid.len);
+			WRITEMEM(gm->gm_oid.data, gm->gm_oid.len);
+			ADJUST_ARGS();
+			RESERVE_SPACE(4);
+			WRITE32(0); /* qop */
+			ADJUST_ARGS();
+			RESERVE_SPACE(4);
+			WRITE32(gss_pseudoflavor_to_service(gm, flav));
+			ADJUST_ARGS();
+			gss_mech_put(gm);
+		} else {
+			RESERVE_SPACE(4);
+			WRITE32(flav);
+			ADJUST_ARGS();
+		}
+	}
+out:
+	if (exp)
+		exp_put(exp);
+}
+
 /*
  * The SETATTR encode routine is special -- it always encodes a bitmap,
  * regardless of the error status.
@@ -2559,6 +2631,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 		break;
 	case OP_SAVEFH:
 		break;
+	case OP_SECINFO:
+		nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo);
+		break;
 	case OP_SETATTR:
 		nfsd4_encode_setattr(resp, op->status, &op->u.setattr);
 		break;
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 54ef1a18a56c..e452256d3f72 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -71,6 +71,9 @@ int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 		                struct svc_export **expp);
 __be32		nfsd_lookup(struct svc_rqst *, struct svc_fh *,
 				const char *, int, struct svc_fh *);
+__be32		 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
+				const char *, int,
+				struct svc_export **, struct dentry **);
 __be32		nfsd_setattr(struct svc_rqst *, struct svc_fh *,
 				struct iattr *, int, time_t);
 #ifdef CONFIG_NFSD_V4
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index 09799bcee0ac..1b653267133a 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -293,6 +293,12 @@ struct nfsd4_rename {
 	struct nfsd4_change_info  rn_tinfo; /* response */
 };
 
+struct nfsd4_secinfo {
+	u32 si_namelen;					/* request */
+	char *si_name;					/* request */
+	struct svc_export *si_exp;			/* response */
+};
+
 struct nfsd4_setattr {
 	stateid_t	sa_stateid;         /* request */
 	u32		sa_bmval[2];        /* request */
@@ -365,6 +371,7 @@ struct nfsd4_op {
 		struct nfsd4_remove		remove;
 		struct nfsd4_rename		rename;
 		clientid_t			renew;
+		struct nfsd4_secinfo		secinfo;
 		struct nfsd4_setattr		setattr;
 		struct nfsd4_setclientid	setclientid;
 		struct nfsd4_setclientid_confirm setclientid_confirm;
-- 
cgit v1.3-8-gc7d7


From 4796f45740bc6f2e3e6cc14e7ed481b38bd0bd39 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:51 -0700
Subject: knfsd: nfsd4: secinfo handling without secinfo= option

We could return some sort of error in the case where someone asks for secinfo
on an export without the secinfo= option set--that'd be no worse than what
we've been doing.  But it's not really correct.  So, hack up an approximate
secinfo response in that case--it may not be complete, but it'll tell the
client at least one acceptable security flavor.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs4xdr.c                  | 30 +++++++++++++++++++++++++++---
 include/linux/sunrpc/svcauth_gss.h |  1 +
 net/sunrpc/auth_gss/svcauth_gss.c  |  9 +++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 864498f8f2d9..b3d55c6747fd 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -57,6 +57,7 @@
 #include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
 #include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/svcauth_gss.h>
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
@@ -2454,15 +2455,38 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, int nfserr,
 {
 	int i = 0;
 	struct svc_export *exp = secinfo->si_exp;
+	u32 nflavs;
+	struct exp_flavor_info *flavs;
+	struct exp_flavor_info def_flavs[2];
 	ENCODE_HEAD;
 
 	if (nfserr)
 		goto out;
+	if (exp->ex_nflavors) {
+		flavs = exp->ex_flavors;
+		nflavs = exp->ex_nflavors;
+	} else { /* Handling of some defaults in absence of real secinfo: */
+		flavs = def_flavs;
+		if (exp->ex_client->flavour->flavour == RPC_AUTH_UNIX) {
+			nflavs = 2;
+			flavs[0].pseudoflavor = RPC_AUTH_UNIX;
+			flavs[1].pseudoflavor = RPC_AUTH_NULL;
+		} else if (exp->ex_client->flavour->flavour == RPC_AUTH_GSS) {
+			nflavs = 1;
+			flavs[0].pseudoflavor
+					= svcauth_gss_flavor(exp->ex_client);
+		} else {
+			nflavs = 1;
+			flavs[0].pseudoflavor
+					= exp->ex_client->flavour->flavour;
+		}
+	}
+
 	RESERVE_SPACE(4);
-	WRITE32(exp->ex_nflavors);
+	WRITE32(nflavs);
 	ADJUST_ARGS();
-	for (i = 0; i < exp->ex_nflavors; i++) {
-		u32 flav = exp->ex_flavors[i].pseudoflavor;
+	for (i = 0; i < nflavs; i++) {
+		u32 flav = flavs[i].pseudoflavor;
 		struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav);
 
 		if (gm) {
diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h
index 5a5db16ab660..417a1def56db 100644
--- a/include/linux/sunrpc/svcauth_gss.h
+++ b/include/linux/sunrpc/svcauth_gss.h
@@ -22,6 +22,7 @@
 int gss_svc_init(void);
 void gss_svc_shutdown(void);
 int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name);
+u32 svcauth_gss_flavor(struct auth_domain *dom);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index e4b3de08b040..490697542fc2 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -743,6 +743,15 @@ find_gss_auth_domain(struct gss_ctx *ctx, u32 svc)
 
 static struct auth_ops svcauthops_gss;
 
+u32 svcauth_gss_flavor(struct auth_domain *dom)
+{
+	struct gss_domain *gd = container_of(dom, struct gss_domain, h);
+
+	return gd->pseudoflavor;
+}
+
+EXPORT_SYMBOL(svcauth_gss_flavor);
+
 int
 svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
 {
-- 
cgit v1.3-8-gc7d7


From 1269bc69b6649282091bb7007372acf4ab8357fd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 17 Jul 2007 04:04:52 -0700
Subject: knfsd: nfsd: enforce per-flavor id squashing

Allow root squashing to vary per-pseudoflavor, so that you can (for example)
allow root access only when sufficiently strong security is in use.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/auth.c              | 18 ++++++++++++++++--
 include/linux/nfsd/export.h |  3 ++-
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 6e92b0fe5323..cf61dc8ae942 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -12,17 +12,31 @@
 
 #define	CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE))
 
+static int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return f->flags;
+	}
+	return exp->ex_flags;
+
+}
+
 int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 {
 	struct svc_cred	cred = rqstp->rq_cred;
 	int i;
+	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
-	if (exp->ex_flags & NFSEXP_ALLSQUASH) {
+	if (flags & NFSEXP_ALLSQUASH) {
 		cred.cr_uid = exp->ex_anon_uid;
 		cred.cr_gid = exp->ex_anon_gid;
 		cred.cr_group_info = groups_alloc(0);
-	} else if (exp->ex_flags & NFSEXP_ROOTSQUASH) {
+	} else if (flags & NFSEXP_ROOTSQUASH) {
 		struct group_info *gi;
 		if (!cred.cr_uid)
 			cred.cr_uid = exp->ex_anon_uid;
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index a01f775cb944..78feb7beff75 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -43,7 +43,8 @@
 #define NFSEXP_ALLFLAGS		0xFE3F
 
 /* The flags that may vary depending on security flavor: */
-#define NFSEXP_SECINFO_FLAGS	0
+#define NFSEXP_SECINFO_FLAGS	(NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
+					| NFSEXP_ALLSQUASH)
 
 #ifdef __KERNEL__
 
-- 
cgit v1.3-8-gc7d7


From 2e774c7caf84455d5e7d492d123bad6f417818b5 Mon Sep 17 00:00:00 2001
From: Mark Zhan <rongkai.zhan@windriver.com>
Date: Tue, 17 Jul 2007 04:05:05 -0700
Subject: rtc: add support for the ST M48T59 RTC

[akpm@linux-foundation.org: x86_64 build fix]
[akpm@linux-foundation.org: The acpi guys changed the bin_attribute code]
Signed-off-by: Mark Zhan <rongkai.zhan@windriver.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/Kconfig        |  10 +
 drivers/rtc/Makefile       |   1 +
 drivers/rtc/rtc-m48t59.c   | 491 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtc/m48t59.h |  57 ++++++
 4 files changed, 559 insertions(+)
 create mode 100644 drivers/rtc/rtc-m48t59.c
 create mode 100644 include/linux/rtc/m48t59.h

(limited to 'include/linux')

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 359323378c55..bcd18765a4ad 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -310,6 +310,16 @@ config RTC_DRV_M48T86
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-m48t86.
 
+config RTC_DRV_M48T59
+	tristate "ST M48T59"
+	depends on RTC_CLASS
+	help
+	  If you say Y here you will get support for the
+	  ST M48T59 RTC chip.
+
+	  This driver can also be built as a module, if so, the module
+	  will be called "rtc-m48t59".
+
 config RTC_DRV_V3020
 	tristate "EM Microelectronic V3020"
 	depends on RTC_CLASS
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 6da5102e0f21..ca9166d319b8 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -43,3 +43,4 @@ obj-$(CONFIG_RTC_DRV_V3020)	+= rtc-v3020.o
 obj-$(CONFIG_RTC_DRV_AT91RM9200)+= rtc-at91rm9200.o
 obj-$(CONFIG_RTC_DRV_SH)	+= rtc-sh.o
 obj-$(CONFIG_RTC_DRV_BFIN)	+= rtc-bfin.o
+obj-$(CONFIG_RTC_DRV_M48T59)	+= rtc-m48t59.o
diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c
new file mode 100644
index 000000000000..33b752350ab5
--- /dev/null
+++ b/drivers/rtc/rtc-m48t59.c
@@ -0,0 +1,491 @@
+/*
+ * ST M48T59 RTC driver
+ *
+ * Copyright (c) 2007 Wind River Systems, Inc.
+ *
+ * Author: Mark Zhan <rongkai.zhan@windriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/rtc/m48t59.h>
+#include <linux/bcd.h>
+
+#ifndef NO_IRQ
+#define NO_IRQ	(-1)
+#endif
+
+#define M48T59_READ(reg)	pdata->read_byte(dev, reg)
+#define M48T59_WRITE(val, reg)	pdata->write_byte(dev, reg, val)
+
+#define M48T59_SET_BITS(mask, reg)	\
+	M48T59_WRITE((M48T59_READ(reg) | (mask)), (reg))
+#define M48T59_CLEAR_BITS(mask, reg)	\
+	M48T59_WRITE((M48T59_READ(reg) & ~(mask)), (reg))
+
+struct m48t59_private {
+	void __iomem *ioaddr;
+	unsigned int size; /* iomem size */
+	unsigned int irq;
+	struct rtc_device *rtc;
+	spinlock_t lock; /* serialize the NVRAM and RTC access */
+};
+
+/*
+ * This is the generic access method when the chip is memory-mapped
+ */
+static void
+m48t59_mem_writeb(struct device *dev, u32 ofs, u8 val)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+
+	writeb(val, m48t59->ioaddr+ofs);
+}
+
+static u8
+m48t59_mem_readb(struct device *dev, u32 ofs)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+
+	return readb(m48t59->ioaddr+ofs);
+}
+
+/*
+ * NOTE: M48T59 only uses BCD mode
+ */
+static int m48t59_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	unsigned long flags;
+	u8 val;
+
+	spin_lock_irqsave(&m48t59->lock, flags);
+	/* Issue the READ command */
+	M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL);
+
+	tm->tm_year	= BCD2BIN(M48T59_READ(M48T59_YEAR));
+	/* tm_mon is 0-11 */
+	tm->tm_mon	= BCD2BIN(M48T59_READ(M48T59_MONTH)) - 1;
+	tm->tm_mday	= BCD2BIN(M48T59_READ(M48T59_MDAY));
+
+	val = M48T59_READ(M48T59_WDAY);
+	if ((val & M48T59_WDAY_CEB) && (val & M48T59_WDAY_CB)) {
+		dev_dbg(dev, "Century bit is enabled\n");
+		tm->tm_year += 100;	/* one century */
+	}
+
+	tm->tm_wday	= BCD2BIN(val & 0x07);
+	tm->tm_hour	= BCD2BIN(M48T59_READ(M48T59_HOUR) & 0x3F);
+	tm->tm_min	= BCD2BIN(M48T59_READ(M48T59_MIN) & 0x7F);
+	tm->tm_sec	= BCD2BIN(M48T59_READ(M48T59_SEC) & 0x7F);
+
+	/* Clear the READ bit */
+	M48T59_CLEAR_BITS(M48T59_CNTL_READ, M48T59_CNTL);
+	spin_unlock_irqrestore(&m48t59->lock, flags);
+
+	dev_dbg(dev, "RTC read time %04d-%02d-%02d %02d/%02d/%02d\n",
+		tm->tm_year + 1900, tm->tm_mon, tm->tm_mday,
+		tm->tm_hour, tm->tm_min, tm->tm_sec);
+	return 0;
+}
+
+static int m48t59_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	unsigned long flags;
+	u8 val = 0;
+
+	dev_dbg(dev, "RTC set time %04d-%02d-%02d %02d/%02d/%02d\n",
+		tm->tm_year + 1900, tm->tm_mon, tm->tm_mday,
+		tm->tm_hour, tm->tm_min, tm->tm_sec);
+
+	spin_lock_irqsave(&m48t59->lock, flags);
+	/* Issue the WRITE command */
+	M48T59_SET_BITS(M48T59_CNTL_WRITE, M48T59_CNTL);
+
+	M48T59_WRITE((BIN2BCD(tm->tm_sec) & 0x7F), M48T59_SEC);
+	M48T59_WRITE((BIN2BCD(tm->tm_min) & 0x7F), M48T59_MIN);
+	M48T59_WRITE((BIN2BCD(tm->tm_hour) & 0x3F), M48T59_HOUR);
+	M48T59_WRITE((BIN2BCD(tm->tm_mday) & 0x3F), M48T59_MDAY);
+	/* tm_mon is 0-11 */
+	M48T59_WRITE((BIN2BCD(tm->tm_mon + 1) & 0x1F), M48T59_MONTH);
+	M48T59_WRITE(BIN2BCD(tm->tm_year % 100), M48T59_YEAR);
+
+	if (tm->tm_year/100)
+		val = (M48T59_WDAY_CEB | M48T59_WDAY_CB);
+	val |= (BIN2BCD(tm->tm_wday) & 0x07);
+	M48T59_WRITE(val, M48T59_WDAY);
+
+	/* Clear the WRITE bit */
+	M48T59_CLEAR_BITS(M48T59_CNTL_WRITE, M48T59_CNTL);
+	spin_unlock_irqrestore(&m48t59->lock, flags);
+	return 0;
+}
+
+/*
+ * Read alarm time and date in RTC
+ */
+static int m48t59_rtc_readalarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	struct rtc_time *tm = &alrm->time;
+	unsigned long flags;
+	u8 val;
+
+	/* If no irq, we don't support ALARM */
+	if (m48t59->irq == NO_IRQ)
+		return -EIO;
+
+	spin_lock_irqsave(&m48t59->lock, flags);
+	/* Issue the READ command */
+	M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL);
+
+	tm->tm_year = BCD2BIN(M48T59_READ(M48T59_YEAR));
+	/* tm_mon is 0-11 */
+	tm->tm_mon = BCD2BIN(M48T59_READ(M48T59_MONTH)) - 1;
+
+	val = M48T59_READ(M48T59_WDAY);
+	if ((val & M48T59_WDAY_CEB) && (val & M48T59_WDAY_CB))
+		tm->tm_year += 100;	/* one century */
+
+	tm->tm_mday = BCD2BIN(M48T59_READ(M48T59_ALARM_DATE));
+	tm->tm_hour = BCD2BIN(M48T59_READ(M48T59_ALARM_HOUR));
+	tm->tm_min = BCD2BIN(M48T59_READ(M48T59_ALARM_MIN));
+	tm->tm_sec = BCD2BIN(M48T59_READ(M48T59_ALARM_SEC));
+
+	/* Clear the READ bit */
+	M48T59_CLEAR_BITS(M48T59_CNTL_READ, M48T59_CNTL);
+	spin_unlock_irqrestore(&m48t59->lock, flags);
+
+	dev_dbg(dev, "RTC read alarm time %04d-%02d-%02d %02d/%02d/%02d\n",
+		tm->tm_year + 1900, tm->tm_mon, tm->tm_mday,
+		tm->tm_hour, tm->tm_min, tm->tm_sec);
+	return 0;
+}
+
+/*
+ * Set alarm time and date in RTC
+ */
+static int m48t59_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	struct rtc_time *tm = &alrm->time;
+	u8 mday, hour, min, sec;
+	unsigned long flags;
+
+	/* If no irq, we don't support ALARM */
+	if (m48t59->irq == NO_IRQ)
+		return -EIO;
+
+	/*
+	 * 0xff means "always match"
+	 */
+	mday = tm->tm_mday;
+	mday = (mday >= 1 && mday <= 31) ? BIN2BCD(mday) : 0xff;
+	if (mday == 0xff)
+		mday = M48T59_READ(M48T59_MDAY);
+
+	hour = tm->tm_hour;
+	hour = (hour < 24) ? BIN2BCD(hour) : 0x00;
+
+	min = tm->tm_min;
+	min = (min < 60) ? BIN2BCD(min) : 0x00;
+
+	sec = tm->tm_sec;
+	sec = (sec < 60) ? BIN2BCD(sec) : 0x00;
+
+	spin_lock_irqsave(&m48t59->lock, flags);
+	/* Issue the WRITE command */
+	M48T59_SET_BITS(M48T59_CNTL_WRITE, M48T59_CNTL);
+
+	M48T59_WRITE(mday, M48T59_ALARM_DATE);
+	M48T59_WRITE(hour, M48T59_ALARM_HOUR);
+	M48T59_WRITE(min, M48T59_ALARM_MIN);
+	M48T59_WRITE(sec, M48T59_ALARM_SEC);
+
+	/* Clear the WRITE bit */
+	M48T59_CLEAR_BITS(M48T59_CNTL_WRITE, M48T59_CNTL);
+	spin_unlock_irqrestore(&m48t59->lock, flags);
+
+	dev_dbg(dev, "RTC set alarm time %04d-%02d-%02d %02d/%02d/%02d\n",
+		tm->tm_year + 1900, tm->tm_mon, tm->tm_mday,
+		tm->tm_hour, tm->tm_min, tm->tm_sec);
+	return 0;
+}
+
+/*
+ * Handle commands from user-space
+ */
+static int m48t59_rtc_ioctl(struct device *dev, unsigned int cmd,
+			unsigned long arg)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&m48t59->lock, flags);
+	switch (cmd) {
+	case RTC_AIE_OFF:	/* alarm interrupt off */
+		M48T59_WRITE(0x00, M48T59_INTR);
+		break;
+	case RTC_AIE_ON:	/* alarm interrupt on */
+		M48T59_WRITE(M48T59_INTR_AFE, M48T59_INTR);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+	spin_unlock_irqrestore(&m48t59->lock, flags);
+
+	return ret;
+}
+
+static int m48t59_rtc_proc(struct device *dev, struct seq_file *seq)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	unsigned long flags;
+	u8 val;
+
+	spin_lock_irqsave(&m48t59->lock, flags);
+	val = M48T59_READ(M48T59_FLAGS);
+	spin_unlock_irqrestore(&m48t59->lock, flags);
+
+	seq_printf(seq, "battery\t\t: %s\n",
+		 (val & M48T59_FLAGS_BF) ? "low" : "normal");
+	return 0;
+}
+
+/*
+ * IRQ handler for the RTC
+ */
+static irqreturn_t m48t59_rtc_interrupt(int irq, void *dev_id)
+{
+	struct device *dev = (struct device *)dev_id;
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	u8 event;
+
+	spin_lock(&m48t59->lock);
+	event = M48T59_READ(M48T59_FLAGS);
+	spin_unlock(&m48t59->lock);
+
+	if (event & M48T59_FLAGS_AF) {
+		rtc_update_irq(m48t59->rtc, 1, (RTC_AF | RTC_IRQF));
+		return IRQ_HANDLED;
+	}
+
+	return IRQ_NONE;
+}
+
+static const struct rtc_class_ops m48t59_rtc_ops = {
+	.ioctl		= m48t59_rtc_ioctl,
+	.read_time	= m48t59_rtc_read_time,
+	.set_time	= m48t59_rtc_set_time,
+	.read_alarm	= m48t59_rtc_readalarm,
+	.set_alarm	= m48t59_rtc_setalarm,
+	.proc		= m48t59_rtc_proc,
+};
+
+static ssize_t m48t59_nvram_read(struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buf, loff_t pos, size_t size)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	ssize_t cnt = 0;
+	unsigned long flags;
+
+	for (; size > 0 && pos < M48T59_NVRAM_SIZE; cnt++, size--) {
+		spin_lock_irqsave(&m48t59->lock, flags);
+		*buf++ = M48T59_READ(cnt);
+		spin_unlock_irqrestore(&m48t59->lock, flags);
+	}
+
+	return cnt;
+}
+
+static ssize_t m48t59_nvram_write(struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buf, loff_t pos, size_t size)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+	ssize_t cnt = 0;
+	unsigned long flags;
+
+	for (; size > 0 && pos < M48T59_NVRAM_SIZE; cnt++, size--) {
+		spin_lock_irqsave(&m48t59->lock, flags);
+		M48T59_WRITE(*buf++, cnt);
+		spin_unlock_irqrestore(&m48t59->lock, flags);
+	}
+
+	return cnt;
+}
+
+static struct bin_attribute m48t59_nvram_attr = {
+	.attr = {
+		.name = "nvram",
+		.mode = S_IRUGO | S_IWUGO,
+		.owner = THIS_MODULE,
+	},
+	.read = m48t59_nvram_read,
+	.write = m48t59_nvram_write,
+};
+
+static int __devinit m48t59_rtc_probe(struct platform_device *pdev)
+{
+	struct m48t59_plat_data *pdata = pdev->dev.platform_data;
+	struct m48t59_private *m48t59 = NULL;
+	struct resource *res;
+	int ret = -ENOMEM;
+
+	/* This chip could be memory-mapped or I/O-mapped */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+		if (!res)
+			return -EINVAL;
+	}
+
+	if (res->flags & IORESOURCE_IO) {
+		/* If we are I/O-mapped, the platform should provide
+		 * the operations accessing chip registers.
+		 */
+		if (!pdata || !pdata->write_byte || !pdata->read_byte)
+			return -EINVAL;
+	} else if (res->flags & IORESOURCE_MEM) {
+		/* we are memory-mapped */
+		if (!pdata) {
+			pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
+			if (!pdata)
+				return -ENOMEM;
+			/* Ensure we only kmalloc platform data once */
+			pdev->dev.platform_data = pdata;
+		}
+
+		/* Try to use the generic memory read/write ops */
+		if (!pdata->write_byte)
+			pdata->write_byte = m48t59_mem_writeb;
+		if (!pdata->read_byte)
+			pdata->read_byte = m48t59_mem_readb;
+	}
+
+	m48t59 = kzalloc(sizeof(*m48t59), GFP_KERNEL);
+	if (!m48t59)
+		return -ENOMEM;
+
+	m48t59->size = res->end - res->start + 1;
+	m48t59->ioaddr = ioremap(res->start, m48t59->size);
+	if (!m48t59->ioaddr)
+		goto out;
+
+	/* Try to get irq number. We also can work in
+	 * the mode without IRQ.
+	 */
+	m48t59->irq = platform_get_irq(pdev, 0);
+	if (m48t59->irq < 0)
+		m48t59->irq = NO_IRQ;
+
+	if (m48t59->irq != NO_IRQ) {
+		ret = request_irq(m48t59->irq, m48t59_rtc_interrupt,
+			IRQF_SHARED, "rtc-m48t59", &pdev->dev);
+		if (ret)
+			goto out;
+	}
+
+	m48t59->rtc = rtc_device_register("m48t59", &pdev->dev,
+				&m48t59_rtc_ops, THIS_MODULE);
+	if (IS_ERR(m48t59->rtc)) {
+		ret = PTR_ERR(m48t59->rtc);
+		goto out;
+	}
+
+	ret = sysfs_create_bin_file(&pdev->dev.kobj, &m48t59_nvram_attr);
+	if (ret)
+		goto out;
+
+	spin_lock_init(&m48t59->lock);
+	platform_set_drvdata(pdev, m48t59);
+	return 0;
+
+out:
+	if (!IS_ERR(m48t59->rtc))
+		rtc_device_unregister(m48t59->rtc);
+	if (m48t59->irq != NO_IRQ)
+		free_irq(m48t59->irq, &pdev->dev);
+	if (m48t59->ioaddr)
+		iounmap(m48t59->ioaddr);
+	if (m48t59)
+		kfree(m48t59);
+	return ret;
+}
+
+static int __devexit m48t59_rtc_remove(struct platform_device *pdev)
+{
+	struct m48t59_private *m48t59 = platform_get_drvdata(pdev);
+
+	sysfs_remove_bin_file(&pdev->dev.kobj, &m48t59_nvram_attr);
+	if (!IS_ERR(m48t59->rtc))
+		rtc_device_unregister(m48t59->rtc);
+	if (m48t59->ioaddr)
+		iounmap(m48t59->ioaddr);
+	if (m48t59->irq != NO_IRQ)
+		free_irq(m48t59->irq, &pdev->dev);
+	platform_set_drvdata(pdev, NULL);
+	kfree(m48t59);
+	return 0;
+}
+
+static struct platform_driver m48t59_rtc_platdrv = {
+	.driver		= {
+		.name	= "rtc-m48t59",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= m48t59_rtc_probe,
+	.remove		= __devexit_p(m48t59_rtc_remove),
+};
+
+static int __init m48t59_rtc_init(void)
+{
+	return platform_driver_register(&m48t59_rtc_platdrv);
+}
+
+static void __exit m48t59_rtc_exit(void)
+{
+	platform_driver_unregister(&m48t59_rtc_platdrv);
+}
+
+module_init(m48t59_rtc_init);
+module_exit(m48t59_rtc_exit);
+
+MODULE_AUTHOR("Mark Zhan <rongkai.zhan@windriver.com>");
+MODULE_DESCRIPTION("M48T59 RTC driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/rtc/m48t59.h b/include/linux/rtc/m48t59.h
new file mode 100644
index 000000000000..e8c7c21ceb1f
--- /dev/null
+++ b/include/linux/rtc/m48t59.h
@@ -0,0 +1,57 @@
+/*
+ * include/linux/rtc/m48t59.h
+ *
+ * Definitions for the platform data of m48t59 RTC chip driver.
+ *
+ * Copyright (c) 2007 Wind River Systems, Inc.
+ *
+ * Mark Zhan <rongkai.zhan@windriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_RTC_M48T59_H_
+#define _LINUX_RTC_M48T59_H_
+
+/*
+ * M48T59 Register Offset
+ */
+#define M48T59_YEAR		0x1fff
+#define M48T59_MONTH		0x1ffe
+#define M48T59_MDAY		0x1ffd	/* Day of Month */
+#define M48T59_WDAY		0x1ffc	/* Day of Week */
+#define M48T59_WDAY_CB			0x20	/* Century Bit */
+#define M48T59_WDAY_CEB			0x10	/* Century Enable Bit */
+#define M48T59_HOUR		0x1ffb
+#define M48T59_MIN		0x1ffa
+#define M48T59_SEC		0x1ff9
+#define M48T59_CNTL		0x1ff8
+#define M48T59_CNTL_READ		0x40
+#define M48T59_CNTL_WRITE		0x80
+#define M48T59_WATCHDOG		0x1ff7
+#define M48T59_INTR		0x1ff6
+#define M48T59_INTR_AFE			0x80	/* Alarm Interrupt Enable */
+#define M48T59_INTR_ABE			0x20
+#define M48T59_ALARM_DATE	0x1ff5
+#define M48T59_ALARM_HOUR	0x1ff4
+#define M48T59_ALARM_MIN	0x1ff3
+#define M48T59_ALARM_SEC	0x1ff2
+#define M48T59_UNUSED		0x1ff1
+#define M48T59_FLAGS		0x1ff0
+#define M48T59_FLAGS_WDT		0x80	/* watchdog timer expired */
+#define M48T59_FLAGS_AF			0x40	/* alarm */
+#define M48T59_FLAGS_BF			0x10	/* low battery */
+
+#define M48T59_NVRAM_SIZE	0x1ff0
+
+struct m48t59_plat_data {
+	/* The method to access M48T59 registers,
+	 * NOTE: The 'ofs' should be 0x00~0x1fff
+	 */
+	void (*write_byte)(struct device *dev, u32 ofs, u8 val);
+	unsigned char (*read_byte)(struct device *dev, u32 ofs);
+};
+
+#endif /* _LINUX_RTC_M48T59_H_ */
-- 
cgit v1.3-8-gc7d7


From 623e71b035cb5271028500720b3622ba76db42bb Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas" <adaplas@gmail.com>
Date: Tue, 17 Jul 2007 04:05:28 -0700
Subject: fbcon: allow fbcon to use the primary display driver

Allow fbcon to select the primary display adapter using the
fb_is_primary_device() arch-specific helper.  If a a primary adapter is
detected, fbcon will unbind the old adapter from the VT layer, then rebind
using the new adapter.  This requires that bind_/unbind_con_driver() be made
public.

Because this feature may produce unexpected behavior (from the user's POV),
this must be explicitly enabled in Kconfig.

[akpm@linux-foundation.org: export unbind_con_driver]
Signed-off-by: Antonino Daplas <adaplas@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/video/fbdev.c       |  4 +--
 drivers/char/vt.c             |  8 ++---
 drivers/video/console/Kconfig | 20 +++++++++++
 drivers/video/console/fbcon.c | 80 +++++++++++++++++++++++++++++++++++++++----
 include/linux/vt_kern.h       |  4 +++
 5 files changed, 102 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/video/fbdev.c b/arch/i386/video/fbdev.c
index 7fc712c46a64..48fb38d7d2c0 100644
--- a/arch/i386/video/fbdev.c
+++ b/arch/i386/video/fbdev.c
@@ -13,13 +13,11 @@
 
 int fb_is_primary_device(struct fb_info *info)
 {
-	struct device *device;
+	struct device *device = info->device;
 	struct pci_dev *pci_dev = NULL;
 	struct resource *res = NULL;
 	int retval = 0;
 
-	device = info->device;
-
 	if (device)
 		pci_dev = to_pci_dev(device);
 
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 349673d432f1..8a389b314624 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2869,8 +2869,7 @@ int __init vty_init(void)
 
 static struct class *vtconsole_class;
 
-static int bind_con_driver(const struct consw *csw, int first, int last,
-			   int deflt)
+int bind_con_driver(const struct consw *csw, int first, int last, int deflt)
 {
 	struct module *owner = csw->owner;
 	const char *desc = NULL;
@@ -2969,6 +2968,7 @@ err:
 	module_put(owner);
 	return retval;
 };
+EXPORT_SYMBOL(bind_con_driver);
 
 #ifdef CONFIG_VT_HW_CONSOLE_BINDING
 static int con_is_graphics(const struct consw *csw, int first, int last)
@@ -2987,8 +2987,7 @@ static int con_is_graphics(const struct consw *csw, int first, int last)
 	return retval;
 }
 
-static int unbind_con_driver(const struct consw *csw, int first, int last,
-			     int deflt)
+int unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
 {
 	struct module *owner = csw->owner;
 	const struct consw *defcsw = NULL;
@@ -3073,6 +3072,7 @@ err:
 	return retval;
 
 }
+EXPORT_SYMBOL(unbind_con_driver);
 
 static int vt_bind(struct con_driver *con)
 {
diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig
index d3b8a6be2916..160f55f7ac96 100644
--- a/drivers/video/console/Kconfig
+++ b/drivers/video/console/Kconfig
@@ -118,6 +118,26 @@ config FRAMEBUFFER_CONSOLE
 	help
 	  Low-level framebuffer-based console driver.
 
+config FRAMEBUFFER_CONSOLE_DETECT_PRIMARY
+       bool "Map the console to the primary display device"
+       depends on FRAMEBUFFER_CONSOLE && VT_HW_CONSOLE_BINDING
+       default n
+       ---help---
+         If this option is selected, the framebuffer console will
+         automatically select the primary display device (if the architecture
+	 supports this feature).  Otherwise, the framebuffer console will
+         always select the first framebuffer driver that is loaded. The latter
+         is the default behavior.
+
+	 You can always override the automatic selection of the primary device
+	 by using the fbcon=map: boot option.
+
+	 To select this feature, "Support for binding and unbinding console
+         drivers", under "Device Drivers"->"Character Devices" must be set to
+	 y.
+
+	 If unsure, select n.
+
 config FRAMEBUFFER_CONSOLE_ROTATION
        bool "Framebuffer Console Rotation"
        depends on FRAMEBUFFER_CONSOLE
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 7f4ed792307c..7ba21eb1676c 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -75,6 +75,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/crc32.h> /* For counting font checksums */
+#include <asm/fb.h>
 #include <asm/irq.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -125,6 +126,8 @@ static int first_fb_vc;
 static int last_fb_vc = MAX_NR_CONSOLES - 1;
 static int fbcon_is_default = 1; 
 static int fbcon_has_exited;
+static int primary_device = -1;
+static int map_override;
 
 /* font data */
 static char fontname[40];
@@ -497,13 +500,17 @@ static int __init fb_console_setup(char *this_opt)
 		
 		if (!strncmp(options, "map:", 4)) {
 			options += 4;
-			if (*options)
+			if (*options) {
 				for (i = 0, j = 0; i < MAX_NR_CONSOLES; i++) {
 					if (!options[j])
 						j = 0;
 					con2fb_map_boot[i] =
 						(options[j++]-'0') % FB_MAX;
 				}
+
+				map_override = 1;
+			}
+
 			return 1;
 		}
 
@@ -3004,9 +3011,9 @@ static int fbcon_mode_deleted(struct fb_info *info,
 	return found;
 }
 
-static int fbcon_fb_unregistered(int idx)
+static int fbcon_fb_unregistered(struct fb_info *info)
 {
-	int i;
+	int i, idx = info->node;
 
 	for (i = first_fb_vc; i <= last_fb_vc; i++) {
 		if (con2fb_map[i] == idx)
@@ -3034,12 +3041,70 @@ static int fbcon_fb_unregistered(int idx)
 	if (!num_registered_fb)
 		unregister_con_driver(&fb_con);
 
+
+	if (primary_device == idx)
+		primary_device = -1;
+
+	return 0;
+}
+
+#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY
+static int fbcon_select_primary(struct fb_info *info)
+{
+	int ret = 0;
+
+	if (!map_override && primary_device == -1 &&
+	    fb_is_primary_device(info)) {
+		int i, err;
+
+		printk(KERN_INFO "fbcon: %s is primary device\n",
+		       info->fix.id);
+		primary_device = info->node;
+
+		if (!con_is_bound(&fb_con))
+			goto done;
+
+		printk(KERN_INFO "fbcon: Unbinding old driver\n");
+		unbind_con_driver(&fb_con, first_fb_vc, last_fb_vc,
+				  fbcon_is_default);
+		info_idx = primary_device;
+
+		for (i = first_fb_vc; i <= last_fb_vc; i++) {
+			con2fb_map_boot[i] = primary_device;
+			con2fb_map[i] = primary_device;
+		}
+
+		printk(KERN_INFO "fbcon: Selecting new driver\n");
+		err = bind_con_driver(&fb_con, first_fb_vc, last_fb_vc,
+				      fbcon_is_default);
+
+		if (err) {
+			for (i = first_fb_vc; i <= last_fb_vc; i++)
+				con2fb_map[i] = -1;
+
+			info_idx = -1;
+		}
+
+		ret = 1;
+	}
+
+done:
+	return ret;
+}
+#else
+static inline int fbcon_select_primary(struct fb_info *info)
+{
 	return 0;
 }
+#endif /* CONFIG_FRAMEBUFFER_DETECT_PRIMARY */
 
-static int fbcon_fb_registered(int idx)
+static int fbcon_fb_registered(struct fb_info *info)
 {
-	int ret = 0, i;
+	int ret = 0, i, idx = info->node;
+
+	if (fbcon_select_primary(info))
+		goto done;
+
 
 	if (info_idx == -1) {
 		for (i = first_fb_vc; i <= last_fb_vc; i++) {
@@ -3059,6 +3124,7 @@ static int fbcon_fb_registered(int idx)
 		}
 	}
 
+done:
 	return ret;
 }
 
@@ -3182,10 +3248,10 @@ static int fbcon_event_notify(struct notifier_block *self,
 		ret = fbcon_mode_deleted(info, mode);
 		break;
 	case FB_EVENT_FB_REGISTERED:
-		ret = fbcon_fb_registered(info->node);
+		ret = fbcon_fb_registered(info);
 		break;
 	case FB_EVENT_FB_UNREGISTERED:
-		ret = fbcon_fb_unregistered(info->node);
+		ret = fbcon_fb_unregistered(info);
 		break;
 	case FB_EVENT_SET_CONSOLE_MAP:
 		con2fb = event->data;
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index d961635d0e61..0d034d89ee8a 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -75,6 +75,10 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc);
 int vt_waitactive(int vt);
 void change_console(struct vc_data *new_vc);
 void reset_vc(struct vc_data *vc);
+extern int bind_con_driver(const struct consw *csw, int first, int last,
+			   int deflt);
+extern int unbind_con_driver(const struct consw *csw, int first, int last,
+			     int deflt);
 
 /*
  * vc_screen.c shares this temporary buffer with the console write code so that
-- 
cgit v1.3-8-gc7d7


From cfafca8067c6defbaeb28cb898b7b3f8abdfe20d Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jesse.barnes@intel.com>
Date: Tue, 17 Jul 2007 04:05:33 -0700
Subject: fbdev: fbcon: console unregistration from unregister_framebuffer

This allows for proper console unregistration via the VT layer, and updates
the FB layer to use it.  This makes debugging new console drivers much easier,
since you can properly clean them up before unloading.

[adaplas]
unregister_framebuffer() is typically called as part of the driver's
module_exit(). Doing so otherwise will freeze the machine as the VT layer is
holding reference counts on fbcon, and fbcon on the driver.  With this change,
it allows unregister_framebuffer() to be called safely anywhere as needed.

Additions from the original:  If multiple drivers are used by fbcon, and if
one of them unregisters, a driver will take over the consoles vacated by the
outgoing one (via set_con2fb_map).   Once only the outgoing driver remains,
then fbcon will unbind from the VT layer (if CONFIG_HW_CONSOLE_UNBINDING is
set to y).

It is important that these drivers implement fb_open() and fb_release()
just to ensure that no other process is using the driver. Likewise, these
drivers _must_ check the return value of unregister_framebuffer().

[akpm@linux-foundation.org: make fbcon_unbind() stub inline]
Signed-off-by: Jesse Barnes <jesse.barnes@intel.com>
Signed-off-by: Antonino Daplas <adaplas@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/console/fbcon.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 drivers/video/fbmem.c         | 26 ++++++++++++++++++++++----
 include/linux/fb.h            |  2 ++
 3 files changed, 66 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 9b66331020dd..decfdc8eb9cc 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -3003,6 +3003,45 @@ static int fbcon_mode_deleted(struct fb_info *info,
 	return found;
 }
 
+#ifdef CONFIG_VT_HW_CONSOLE_BINDING
+static int fbcon_unbind(void)
+{
+	int ret;
+
+	ret = unbind_con_driver(&fb_con, first_fb_vc, last_fb_vc,
+				fbcon_is_default);
+	return ret;
+}
+#else
+static inline int fbcon_unbind(void)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_VT_HW_CONSOLE_BINDING */
+
+static int fbcon_fb_unbind(int idx)
+{
+	int i, new_idx = -1, ret = 0;
+
+	for (i = first_fb_vc; i <= last_fb_vc; i++) {
+		if (con2fb_map[i] != idx &&
+		    con2fb_map[i] != -1) {
+			new_idx = i;
+			break;
+		}
+	}
+
+	if (new_idx != -1) {
+		for (i = first_fb_vc; i <= last_fb_vc; i++) {
+			if (con2fb_map[i] == idx)
+				set_con2fb_map(i, new_idx, 0);
+		}
+	} else
+		ret = fbcon_unbind();
+
+	return ret;
+}
+
 static int fbcon_fb_unregistered(struct fb_info *info)
 {
 	int i, idx = info->node;
@@ -3210,6 +3249,9 @@ static int fbcon_event_notify(struct notifier_block *self,
 		mode = event->data;
 		ret = fbcon_mode_deleted(info, mode);
 		break;
+	case FB_EVENT_FB_UNBIND:
+		ret = fbcon_fb_unbind(info->node);
+		break;
 	case FB_EVENT_FB_REGISTERED:
 		ret = fbcon_fb_registered(info);
 		break;
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 8d6dbe8b8dd1..7f3a0cca0fd4 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1334,17 +1334,34 @@ register_framebuffer(struct fb_info *fb_info)
  *
  *	Returns negative errno on error, or zero for success.
  *
+ *      This function will also notify the framebuffer console
+ *      to release the driver.
+ *
+ *      This is meant to be called within a driver's module_exit()
+ *      function. If this is called outside module_exit(), ensure
+ *      that the driver implements fb_open() and fb_release() to
+ *      check that no processes are using the device.
  */
 
 int
 unregister_framebuffer(struct fb_info *fb_info)
 {
 	struct fb_event event;
-	int i;
+	int i, ret = 0;
 
 	i = fb_info->node;
-	if (!registered_fb[i])
-		return -EINVAL;
+	if (!registered_fb[i]) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	event.info = fb_info;
+	ret = fb_notifier_call_chain(FB_EVENT_FB_UNBIND, &event);
+
+	if (ret) {
+		ret = -EINVAL;
+		goto done;
+	}
 
 	if (fb_info->pixmap.addr &&
 	    (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT))
@@ -1356,7 +1373,8 @@ unregister_framebuffer(struct fb_info *fb_info)
 	device_destroy(fb_class, MKDEV(FB_MAJOR, i));
 	event.info = fb_info;
 	fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event);
-	return 0;
+done:
+	return ret;
 }
 
 /**
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 66226824ab68..8628423c6dd3 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -529,6 +529,8 @@ struct fb_cursor_user {
 #define FB_EVENT_CONBLANK               0x0C
 /*      Get drawing requirements        */
 #define FB_EVENT_GET_REQ                0x0D
+/*      Unbind from the console if possible */
+#define FB_EVENT_FB_UNBIND              0x0E
 
 struct fb_event {
 	struct fb_info *info;
-- 
cgit v1.3-8-gc7d7


From b7269dd2b97b9aedb64e15fdec5575345d091925 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Tue, 17 Jul 2007 04:05:34 -0700
Subject: vt: add comment for unbind_con_driver()

- add comment for unbind_con_driver().
- bind_con_driver() is made private again

Signed-off-by: Jesse Barnes <jesse.barnes@intel.com>
Signed-off-by: Antonino Daplas <adaplas@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/vt.c       | 21 +++++++++++++++++++--
 include/linux/vt_kern.h |  2 --
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 8a389b314624..45b33cf97643 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2869,7 +2869,8 @@ int __init vty_init(void)
 
 static struct class *vtconsole_class;
 
-int bind_con_driver(const struct consw *csw, int first, int last, int deflt)
+static int bind_con_driver(const struct consw *csw, int first, int last,
+			   int deflt)
 {
 	struct module *owner = csw->owner;
 	const char *desc = NULL;
@@ -2968,7 +2969,6 @@ err:
 	module_put(owner);
 	return retval;
 };
-EXPORT_SYMBOL(bind_con_driver);
 
 #ifdef CONFIG_VT_HW_CONSOLE_BINDING
 static int con_is_graphics(const struct consw *csw, int first, int last)
@@ -2987,6 +2987,23 @@ static int con_is_graphics(const struct consw *csw, int first, int last)
 	return retval;
 }
 
+/**
+ * unbind_con_driver - unbind a console driver
+ * @csw: pointer to console driver to unregister
+ * @first: first in range of consoles that @csw should be unbound from
+ * @last: last in range of consoles that @csw should be unbound from
+ * @deflt: should next bound console driver be default after @csw is unbound?
+ *
+ * To unbind a driver from all possible consoles, pass 0 as @first and
+ * %MAX_NR_CONSOLES as @last.
+ *
+ * @deflt controls whether the console that ends up replacing @csw should be
+ * the default console.
+ *
+ * RETURNS:
+ * -ENODEV if @csw isn't a registered console driver or can't be unregistered
+ * or 0 on success.
+ */
 int unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
 {
 	struct module *owner = csw->owner;
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index 0d034d89ee8a..699b7e9864fa 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -75,8 +75,6 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc);
 int vt_waitactive(int vt);
 void change_console(struct vc_data *new_vc);
 void reset_vc(struct vc_data *vc);
-extern int bind_con_driver(const struct consw *csw, int first, int last,
-			   int deflt);
 extern int unbind_con_driver(const struct consw *csw, int first, int last,
 			     int deflt);
 
-- 
cgit v1.3-8-gc7d7


From 9900abfb5e8192f0eafcd9b9dd5d54011e46c76c Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Tue, 17 Jul 2007 04:05:50 -0700
Subject: fbdev: Add fb_append_extra_logo()

Add fb_append_extra_logo(), to append extra lines of logos below the standard
Linux logo.

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Geoff Levand <geoffrey.levand@am.sony.com>
Acked-By: James Simmons <jsimmons@infradead.org>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/fbmem.c      | 212 ++++++++++++++++++++++++++++++---------------
 drivers/video/logo/Kconfig |   4 +
 include/linux/linux_logo.h |   8 ++
 3 files changed, 155 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 717684bde486..215ac579f901 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -404,72 +404,6 @@ static void fb_do_show_logo(struct fb_info *info, struct fb_image *image,
 	}
 }
 
-int fb_prepare_logo(struct fb_info *info, int rotate)
-{
-	int depth = fb_get_color_depth(&info->var, &info->fix);
-	int yres;
-
-	memset(&fb_logo, 0, sizeof(struct logo_data));
-
-	if (info->flags & FBINFO_MISC_TILEBLITTING ||
-	    info->flags & FBINFO_MODULE)
-		return 0;
-
-	if (info->fix.visual == FB_VISUAL_DIRECTCOLOR) {
-		depth = info->var.blue.length;
-		if (info->var.red.length < depth)
-			depth = info->var.red.length;
-		if (info->var.green.length < depth)
-			depth = info->var.green.length;
-	}
-
-	if (info->fix.visual == FB_VISUAL_STATIC_PSEUDOCOLOR && depth > 4) {
-		/* assume console colormap */
-		depth = 4;
-	}
-
-	if (depth >= 8) {
-		switch (info->fix.visual) {
-		case FB_VISUAL_TRUECOLOR:
-			fb_logo.needs_truepalette = 1;
-			break;
-		case FB_VISUAL_DIRECTCOLOR:
-			fb_logo.needs_directpalette = 1;
-			fb_logo.needs_cmapreset = 1;
-			break;
-		case FB_VISUAL_PSEUDOCOLOR:
-			fb_logo.needs_cmapreset = 1;
-			break;
-		}
-	}
-
-	/* Return if no suitable logo was found */
-	fb_logo.logo = fb_find_logo(depth);
-
-	if (!fb_logo.logo) {
-		return 0;
-	}
-	
-	if (rotate == FB_ROTATE_UR || rotate == FB_ROTATE_UD)
-		yres = info->var.yres;
-	else
-		yres = info->var.xres;
-
-	if (fb_logo.logo->height > yres) {
-		fb_logo.logo = NULL;
-		return 0;
-	}
-
-	/* What depth we asked for might be different from what we get */
-	if (fb_logo.logo->type == LINUX_LOGO_CLUT224)
-		fb_logo.depth = 8;
-	else if (fb_logo.logo->type == LINUX_LOGO_VGA16)
-		fb_logo.depth = 4;
-	else
-		fb_logo.depth = 1;		
-	return fb_logo.logo->height;
-}
-
 static int fb_show_logo_line(struct fb_info *info, int rotate,
 			     const struct linux_logo *logo, int y,
 			     unsigned int n)
@@ -489,7 +423,7 @@ static int fb_show_logo_line(struct fb_info *info, int rotate,
 	if (fb_logo.needs_cmapreset)
 		fb_set_logocmap(info, logo);
 
-	if (fb_logo.needs_truepalette || 
+	if (fb_logo.needs_truepalette ||
 	    fb_logo.needs_directpalette) {
 		palette = kmalloc(256 * 4, GFP_KERNEL);
 		if (palette == NULL)
@@ -538,10 +472,150 @@ static int fb_show_logo_line(struct fb_info *info, int rotate,
 	return logo->height;
 }
 
+
+#ifdef CONFIG_FB_LOGO_EXTRA
+
+#define FB_LOGO_EX_NUM_MAX 10
+static struct logo_data_extra {
+	const struct linux_logo *logo;
+	unsigned int n;
+} fb_logo_ex[FB_LOGO_EX_NUM_MAX];
+static unsigned int fb_logo_ex_num;
+
+void fb_append_extra_logo(const struct linux_logo *logo, unsigned int n)
+{
+	if (!n || fb_logo_ex_num == FB_LOGO_EX_NUM_MAX)
+		return;
+
+	fb_logo_ex[fb_logo_ex_num].logo = logo;
+	fb_logo_ex[fb_logo_ex_num].n = n;
+	fb_logo_ex_num++;
+}
+
+static int fb_prepare_extra_logos(struct fb_info *info, unsigned int height,
+				  unsigned int yres)
+{
+	unsigned int i;
+
+	/* FIXME: logo_ex supports only truecolor fb. */
+	if (info->fix.visual != FB_VISUAL_TRUECOLOR)
+		fb_logo_ex_num = 0;
+
+	for (i = 0; i < fb_logo_ex_num; i++) {
+		height += fb_logo_ex[i].logo->height;
+		if (height > yres) {
+			height -= fb_logo_ex[i].logo->height;
+			fb_logo_ex_num = i;
+			break;
+		}
+	}
+	return height;
+}
+
+static int fb_show_extra_logos(struct fb_info *info, int y, int rotate)
+{
+	unsigned int i;
+
+	for (i = 0; i < fb_logo_ex_num; i++)
+		y += fb_show_logo_line(info, rotate,
+				       fb_logo_ex[i].logo, y, fb_logo_ex[i].n);
+
+	return y;
+}
+
+#else /* !CONFIG_FB_LOGO_EXTRA */
+
+static inline int fb_prepare_extra_logos(struct fb_info *info,
+					 unsigned int height,
+					 unsigned int yres)
+{
+	return height;
+}
+
+static inline int fb_show_extra_logos(struct fb_info *info, int y, int rotate)
+{
+	return y;
+}
+
+#endif /* CONFIG_FB_LOGO_EXTRA */
+
+
+int fb_prepare_logo(struct fb_info *info, int rotate)
+{
+	int depth = fb_get_color_depth(&info->var, &info->fix);
+	unsigned int yres;
+
+	memset(&fb_logo, 0, sizeof(struct logo_data));
+
+	if (info->flags & FBINFO_MISC_TILEBLITTING ||
+	    info->flags & FBINFO_MODULE)
+		return 0;
+
+	if (info->fix.visual == FB_VISUAL_DIRECTCOLOR) {
+		depth = info->var.blue.length;
+		if (info->var.red.length < depth)
+			depth = info->var.red.length;
+		if (info->var.green.length < depth)
+			depth = info->var.green.length;
+	}
+
+	if (info->fix.visual == FB_VISUAL_STATIC_PSEUDOCOLOR && depth > 4) {
+		/* assume console colormap */
+		depth = 4;
+	}
+
+	if (depth >= 8) {
+		switch (info->fix.visual) {
+		case FB_VISUAL_TRUECOLOR:
+			fb_logo.needs_truepalette = 1;
+			break;
+		case FB_VISUAL_DIRECTCOLOR:
+			fb_logo.needs_directpalette = 1;
+			fb_logo.needs_cmapreset = 1;
+			break;
+		case FB_VISUAL_PSEUDOCOLOR:
+			fb_logo.needs_cmapreset = 1;
+			break;
+		}
+	}
+
+	/* Return if no suitable logo was found */
+	fb_logo.logo = fb_find_logo(depth);
+
+	if (!fb_logo.logo) {
+		return 0;
+	}
+
+	if (rotate == FB_ROTATE_UR || rotate == FB_ROTATE_UD)
+		yres = info->var.yres;
+	else
+		yres = info->var.xres;
+
+	if (fb_logo.logo->height > yres) {
+		fb_logo.logo = NULL;
+		return 0;
+	}
+
+	/* What depth we asked for might be different from what we get */
+	if (fb_logo.logo->type == LINUX_LOGO_CLUT224)
+		fb_logo.depth = 8;
+	else if (fb_logo.logo->type == LINUX_LOGO_VGA16)
+		fb_logo.depth = 4;
+	else
+		fb_logo.depth = 1;
+
+	return fb_prepare_extra_logos(info, fb_logo.logo->height, yres);
+}
+
 int fb_show_logo(struct fb_info *info, int rotate)
 {
-	return fb_show_logo_line(info, rotate, fb_logo.logo, 0,
-				 num_online_cpus());
+	int y;
+
+	y = fb_show_logo_line(info, rotate, fb_logo.logo, 0,
+			      num_online_cpus());
+	y = fb_show_extra_logos(info, y, rotate);
+
+	return y;
 }
 #else
 int fb_prepare_logo(struct fb_info *info, int rotate) { return 0; }
diff --git a/drivers/video/logo/Kconfig b/drivers/video/logo/Kconfig
index 9397bcef3018..5bb78e4807cd 100644
--- a/drivers/video/logo/Kconfig
+++ b/drivers/video/logo/Kconfig
@@ -10,6 +10,10 @@ menuconfig LOGO
 
 if LOGO
 
+config FB_LOGO_EXTRA
+	bool
+	depends on FB
+
 config LOGO_LINUX_MONO
 	bool "Standard black and white Linux logo"
 	default y
diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h
index 9c01bde5bf1b..08a92969c76e 100644
--- a/include/linux/linux_logo.h
+++ b/include/linux/linux_logo.h
@@ -33,5 +33,13 @@ struct linux_logo {
 };
 
 extern const struct linux_logo *fb_find_logo(int depth);
+#ifdef CONFIG_FB_LOGO_EXTRA
+extern void fb_append_extra_logo(const struct linux_logo *logo,
+				 unsigned int n);
+#else
+static inline void fb_append_extra_logo(const struct linux_logo *logo,
+					unsigned int n)
+{}
+#endif
 
 #endif /* _LINUX_LINUX_LOGO_H */
-- 
cgit v1.3-8-gc7d7


From fe0e3a9df6372d357d3fdc4b6265a5417f1e84e8 Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@solidboot.com>
Date: Tue, 17 Jul 2007 04:05:55 -0700
Subject: OMAP: add TI OMAP1610 accelerator entry.

Signed-off-by: Trilok Soni <soni.trilok@gmail.com>
Cc: Tony Lindgren <tony@atomide.com>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 8628423c6dd3..cec54106aa87 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -119,6 +119,7 @@ struct dentry;
 #define FB_ACCEL_NV_40          46      /* nVidia Arch 40               */
 #define FB_ACCEL_XGI_VOLARI_V	47	/* XGI Volari V3XT, V5, V8      */
 #define FB_ACCEL_XGI_VOLARI_Z	48	/* XGI Volari Z7                */
+#define FB_ACCEL_OMAP1610	49	/* TI OMAP16xx                  */
 #define FB_ACCEL_NEOMAGIC_NM2070 90	/* NeoMagic NM2070              */
 #define FB_ACCEL_NEOMAGIC_NM2090 91	/* NeoMagic NM2090              */
 #define FB_ACCEL_NEOMAGIC_NM2093 92	/* NeoMagic NM2093              */
-- 
cgit v1.3-8-gc7d7


From 713f6ab18b0e7d39f14401362bfe8015b1aedde1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 17 Jul 2007 04:06:12 -0700
Subject: md: improve the is_mddev_idle test fix

Don't use 'unsigned' variable to track sync vs non-sync IO, as the only thing
we want to do with them is a signed comparison, and fix up the comment which
had become quite wrong.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c           | 35 ++++++++++++++++++++++-------------
 include/linux/raid/md_k.h |  2 +-
 2 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3d5a6beb333b..bae42331182d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5091,7 +5091,7 @@ static int is_mddev_idle(mddev_t *mddev)
 	mdk_rdev_t * rdev;
 	struct list_head *tmp;
 	int idle;
-	unsigned long curr_events;
+	long curr_events;
 
 	idle = 1;
 	ITERATE_RDEV(mddev,rdev,tmp) {
@@ -5099,20 +5099,29 @@ static int is_mddev_idle(mddev_t *mddev)
 		curr_events = disk_stat_read(disk, sectors[0]) + 
 				disk_stat_read(disk, sectors[1]) - 
 				atomic_read(&disk->sync_io);
-		/* The difference between curr_events and last_events
-		 * will be affected by any new non-sync IO (making
-		 * curr_events bigger) and any difference in the amount of
-		 * in-flight syncio (making current_events bigger or smaller)
-		 * The amount in-flight is currently limited to
-		 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
-		 * which is at most 4096 sectors.
-		 * These numbers are fairly fragile and should be made
-		 * more robust, probably by enforcing the
-		 * 'window size' that md_do_sync sort-of uses.
+		/* sync IO will cause sync_io to increase before the disk_stats
+		 * as sync_io is counted when a request starts, and
+		 * disk_stats is counted when it completes.
+		 * So resync activity will cause curr_events to be smaller than
+		 * when there was no such activity.
+		 * non-sync IO will cause disk_stat to increase without
+		 * increasing sync_io so curr_events will (eventually)
+		 * be larger than it was before.  Once it becomes
+		 * substantially larger, the test below will cause
+		 * the array to appear non-idle, and resync will slow
+		 * down.
+		 * If there is a lot of outstanding resync activity when
+		 * we set last_event to curr_events, then all that activity
+		 * completing might cause the array to appear non-idle
+		 * and resync will be slowed down even though there might
+		 * not have been non-resync activity.  This will only
+		 * happen once though.  'last_events' will soon reflect
+		 * the state where there is little or no outstanding
+		 * resync requests, and further resync activity will
+		 * always make curr_events less than last_events.
 		 *
-		 * Note: the following is an unsigned comparison.
 		 */
-		if ((long)curr_events - (long)rdev->last_events > 4096) {
+		if (curr_events - rdev->last_events > 4096) {
 			rdev->last_events = curr_events;
 			idle = 0;
 		}
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index de72c49747c8..28ac632b42dd 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -51,7 +51,7 @@ struct mdk_rdev_s
 
 	sector_t size;			/* Device size (in blocks) */
 	mddev_t *mddev;			/* RAID array if running */
-	unsigned long last_events;	/* IO event timestamp */
+	long last_events;		/* IO event timestamp */
 
 	struct block_device *bdev;	/* block device handle */
 
-- 
cgit v1.3-8-gc7d7


From 4ad1366376bfef32ec0ffa12d1faa483d6f330bd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 17 Jul 2007 04:06:13 -0700
Subject: md: change bitmap_unplug and others to void functions

bitmap_unplug only ever returns 0, so it may as well be void.  Two callers try
to print a message if it returns non-zero, but that message is already printed
by bitmap_file_kick.

write_page returns an error which is not consistently checked.  It always
causes BITMAP_WRITE_ERROR to be set on an error, and that can more
conveniently be checked.

When the return of write_page is checked, an error causes bitmap_file_kick to
be called - so move that call into write_page - and protect against recursive
calls into bitmap_file_kick.

bitmap_update_sb returns an error that is never checked.

So make these 'void' and be consistent about checking the bit.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/bitmap.c         | 140 +++++++++++++++++++++++---------------------
 drivers/md/md.c             |   3 +-
 drivers/md/raid1.c          |   3 +-
 drivers/md/raid10.c         |   3 +-
 include/linux/raid/bitmap.h |   6 +-
 5 files changed, 78 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ae94f3beb5fc..927cb34c4805 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -305,10 +305,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	return 0;
 }
 
+static void bitmap_file_kick(struct bitmap *bitmap);
 /*
  * write out a page to a file
  */
-static int write_page(struct bitmap *bitmap, struct page *page, int wait)
+static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	struct buffer_head *bh;
 
@@ -316,27 +317,26 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 		switch (write_sb_page(bitmap, page, wait)) {
 		case -EINVAL:
 			bitmap->flags |= BITMAP_WRITE_ERROR;
-			return -EIO;
 		}
-		return 0;
-	}
+	} else {
 
-	bh = page_buffers(page);
+		bh = page_buffers(page);
 
-	while (bh && bh->b_blocknr) {
-		atomic_inc(&bitmap->pending_writes);
-		set_buffer_locked(bh);
-		set_buffer_mapped(bh);
-		submit_bh(WRITE, bh);
-		bh = bh->b_this_page;
-	}
+		while (bh && bh->b_blocknr) {
+			atomic_inc(&bitmap->pending_writes);
+			set_buffer_locked(bh);
+			set_buffer_mapped(bh);
+			submit_bh(WRITE, bh);
+			bh = bh->b_this_page;
+		}
 
-	if (wait) {
-		wait_event(bitmap->write_wait,
-			   atomic_read(&bitmap->pending_writes)==0);
-		return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
+		if (wait) {
+			wait_event(bitmap->write_wait,
+				   atomic_read(&bitmap->pending_writes)==0);
+		}
 	}
-	return 0;
+	if (bitmap->flags & BITMAP_WRITE_ERROR)
+		bitmap_file_kick(bitmap);
 }
 
 static void end_bitmap_write(struct buffer_head *bh, int uptodate)
@@ -456,17 +456,17 @@ out:
  */
 
 /* update the event counter and sync the superblock to disk */
-int bitmap_update_sb(struct bitmap *bitmap)
+void bitmap_update_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
 	unsigned long flags;
 
 	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
-		return 0;
+		return;
 	spin_lock_irqsave(&bitmap->lock, flags);
 	if (!bitmap->sb_page) { /* no superblock */
 		spin_unlock_irqrestore(&bitmap->lock, flags);
-		return 0;
+		return;
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -474,7 +474,7 @@ int bitmap_update_sb(struct bitmap *bitmap)
 	if (!bitmap->mddev->degraded)
 		sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
 	kunmap_atomic(sb, KM_USER0);
-	return write_page(bitmap, bitmap->sb_page, 1);
+	write_page(bitmap, bitmap->sb_page, 1);
 }
 
 /* print out the bitmap file superblock */
@@ -603,20 +603,22 @@ enum bitmap_mask_op {
 	MASK_UNSET
 };
 
-/* record the state of the bitmap in the superblock */
-static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
-				enum bitmap_mask_op op)
+/* record the state of the bitmap in the superblock.  Return the old value */
+static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
+			     enum bitmap_mask_op op)
 {
 	bitmap_super_t *sb;
 	unsigned long flags;
+	int old;
 
 	spin_lock_irqsave(&bitmap->lock, flags);
 	if (!bitmap->sb_page) { /* can't set the state */
 		spin_unlock_irqrestore(&bitmap->lock, flags);
-		return;
+		return 0;
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	old = le32_to_cpu(sb->state) & bits;
 	switch (op) {
 		case MASK_SET: sb->state |= cpu_to_le32(bits);
 				break;
@@ -625,6 +627,7 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		default: BUG();
 	}
 	kunmap_atomic(sb, KM_USER0);
+	return old;
 }
 
 /*
@@ -718,18 +721,23 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 {
 	char *path, *ptr = NULL;
 
-	bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET);
-	bitmap_update_sb(bitmap);
+	if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) {
+		bitmap_update_sb(bitmap);
 
-	if (bitmap->file) {
-		path = kmalloc(PAGE_SIZE, GFP_KERNEL);
-		if (path)
-			ptr = file_path(bitmap->file, path, PAGE_SIZE);
+		if (bitmap->file) {
+			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
+			if (path)
+				ptr = file_path(bitmap->file, path, PAGE_SIZE);
 
-		printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n",
-		       bmname(bitmap), ptr ? ptr : "");
+			printk(KERN_ALERT
+			      "%s: kicking failed bitmap file %s from array!\n",
+			      bmname(bitmap), ptr ? ptr : "");
 
-		kfree(path);
+			kfree(path);
+		} else
+			printk(KERN_ALERT
+			       "%s: disabling internal bitmap due to errors\n",
+			       bmname(bitmap));
 	}
 
 	bitmap_file_put(bitmap);
@@ -800,16 +808,15 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
-int bitmap_unplug(struct bitmap *bitmap)
+void bitmap_unplug(struct bitmap *bitmap)
 {
 	unsigned long i, flags;
 	int dirty, need_write;
 	struct page *page;
 	int wait = 0;
-	int err;
 
 	if (!bitmap)
-		return 0;
+		return;
 
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
@@ -817,7 +824,7 @@ int bitmap_unplug(struct bitmap *bitmap)
 		spin_lock_irqsave(&bitmap->lock, flags);
 		if (!bitmap->filemap) {
 			spin_unlock_irqrestore(&bitmap->lock, flags);
-			return 0;
+			return;
 		}
 		page = bitmap->filemap[i];
 		dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
@@ -829,7 +836,7 @@ int bitmap_unplug(struct bitmap *bitmap)
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 
 		if (dirty | need_write)
-			err = write_page(bitmap, page, 0);
+			write_page(bitmap, page, 0);
 	}
 	if (wait) { /* if any writes were performed, we need to wait on them */
 		if (bitmap->file)
@@ -840,7 +847,6 @@ int bitmap_unplug(struct bitmap *bitmap)
 	}
 	if (bitmap->flags & BITMAP_WRITE_ERROR)
 		bitmap_file_kick(bitmap);
-	return 0;
 }
 
 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
@@ -889,21 +895,21 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 			bmname(bitmap),
 			(unsigned long) i_size_read(file->f_mapping->host),
 			bytes + sizeof(bitmap_super_t));
-		goto out;
+		goto err;
 	}
 
 	ret = -ENOMEM;
 
 	bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
 	if (!bitmap->filemap)
-		goto out;
+		goto err;
 
 	/* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
 	bitmap->filemap_attr = kzalloc(
 		roundup( DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
 		GFP_KERNEL);
 	if (!bitmap->filemap_attr)
-		goto out;
+		goto err;
 
 	oldindex = ~0L;
 
@@ -936,7 +942,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 			}
 			if (IS_ERR(page)) { /* read error */
 				ret = PTR_ERR(page);
-				goto out;
+				goto err;
 			}
 
 			oldindex = index;
@@ -951,11 +957,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				memset(paddr + offset, 0xff,
 				       PAGE_SIZE - offset);
 				kunmap_atomic(paddr, KM_USER0);
-				ret = write_page(bitmap, page, 1);
-				if (ret) {
+				write_page(bitmap, page, 1);
+
+				ret = -EIO;
+				if (bitmap->flags & BITMAP_WRITE_ERROR) {
 					/* release, page not in filemap yet */
 					put_page(page);
-					goto out;
+					goto err;
 				}
 			}
 
@@ -987,11 +995,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		md_wakeup_thread(bitmap->mddev->thread);
 	}
 
-out:
 	printk(KERN_INFO "%s: bitmap initialized from disk: "
-		"read %lu/%lu pages, set %lu bits, status: %d\n",
-		bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, ret);
+		"read %lu/%lu pages, set %lu bits\n",
+		bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt);
+
+	return 0;
 
+ err:
+	printk(KERN_INFO "%s: bitmap initialisation failed: %d\n",
+	       bmname(bitmap), ret);
 	return ret;
 }
 
@@ -1028,19 +1040,18 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
  *			out to disk
  */
 
-int bitmap_daemon_work(struct bitmap *bitmap)
+void bitmap_daemon_work(struct bitmap *bitmap)
 {
 	unsigned long j;
 	unsigned long flags;
 	struct page *page = NULL, *lastpage = NULL;
-	int err = 0;
 	int blocks;
 	void *paddr;
 
 	if (bitmap == NULL)
-		return 0;
+		return;
 	if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
-		return 0;
+		return;
 	bitmap->daemon_lastrun = jiffies;
 
 	for (j = 0; j < bitmap->chunks; j++) {
@@ -1063,14 +1074,8 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 					clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 
 				spin_unlock_irqrestore(&bitmap->lock, flags);
-				if (need_write) {
-					switch (write_page(bitmap, page, 0)) {
-					case 0:
-						break;
-					default:
-						bitmap_file_kick(bitmap);
-					}
-				}
+				if (need_write)
+					write_page(bitmap, page, 0);
 				continue;
 			}
 
@@ -1079,13 +1084,11 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 				if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 					clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
-					err = write_page(bitmap, lastpage, 0);
+					write_page(bitmap, lastpage, 0);
 				} else {
 					set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 				}
-				if (err)
-					bitmap_file_kick(bitmap);
 			} else
 				spin_unlock_irqrestore(&bitmap->lock, flags);
 			lastpage = page;
@@ -1128,14 +1131,13 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 		if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
-			err = write_page(bitmap, lastpage, 0);
+			write_page(bitmap, lastpage, 0);
 		} else {
 			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 		}
 	}
 
-	return err;
 }
 
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1548,7 +1550,9 @@ int bitmap_create(mddev_t *mddev)
 
 	mddev->thread->timeout = bitmap->daemon_sleep * HZ;
 
-	return bitmap_update_sb(bitmap);
+	bitmap_update_sb(bitmap);
+
+	return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
 
  error:
 	bitmap_free(bitmap);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 36c05ba7855a..65ddc887dfd7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1640,7 +1640,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 
 static void md_update_sb(mddev_t * mddev, int force_change)
 {
-	int err;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 	int sync_req;
@@ -1727,7 +1726,7 @@ repeat:
 		"md: updating %s RAID superblock on device (in sync %d)\n",
 		mdname(mddev),mddev->in_sync);
 
-	err = bitmap_update_sb(mddev->bitmap);
+	bitmap_update_sb(mddev->bitmap);
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		char b[BDEVNAME_SIZE];
 		dprintk(KERN_INFO "md: ");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 46677d7d9980..00c78b77b13d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1526,8 +1526,7 @@ static void raid1d(mddev_t *mddev)
 			blk_remove_plug(mddev->queue);
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
-			if (bitmap_unplug(mddev->bitmap) != 0)
-				printk("%s: bitmap file write failed!\n", mdname(mddev));
+			bitmap_unplug(mddev->bitmap);
 
 			while (bio) { /* submit pending writes */
 				struct bio *next = bio->bi_next;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9eb66c1b523b..a95ada1cfac4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1510,8 +1510,7 @@ static void raid10d(mddev_t *mddev)
 			blk_remove_plug(mddev->queue);
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
-			if (bitmap_unplug(mddev->bitmap) != 0)
-				printk("%s: bitmap file write failed!\n", mdname(mddev));
+			bitmap_unplug(mddev->bitmap);
 
 			while (bio) { /* submit pending writes */
 				struct bio *next = bio->bi_next;
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index dd5a05d03d4f..75e17a05540e 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -262,7 +262,7 @@ int  bitmap_active(struct bitmap *bitmap);
 
 char *file_path(struct file *file, char *buf, int count);
 void bitmap_print_sb(struct bitmap *bitmap);
-int bitmap_update_sb(struct bitmap *bitmap);
+void bitmap_update_sb(struct bitmap *bitmap);
 
 int  bitmap_setallbits(struct bitmap *bitmap);
 void bitmap_write_all(struct bitmap *bitmap);
@@ -278,8 +278,8 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int d
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
 void bitmap_close_sync(struct bitmap *bitmap);
 
-int bitmap_unplug(struct bitmap *bitmap);
-int bitmap_daemon_work(struct bitmap *bitmap);
+void bitmap_unplug(struct bitmap *bitmap);
+void bitmap_daemon_work(struct bitmap *bitmap);
 #endif
 
 #endif
-- 
cgit v1.3-8-gc7d7


From 3bd858ab1c451725c07a805dcb315215dc85b86e Mon Sep 17 00:00:00 2001
From: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Date: Tue, 17 Jul 2007 15:00:08 +0530
Subject: Introduce is_owner_or_cap() to wrap CAP_FOWNER use with fsuid check

Introduce is_owner_or_cap() macro in fs.h, and convert over relevant
users to it. This is done because we want to avoid bugs in the future
where we check for only effective fsuid of the current task against a
file's owning uid, without simultaneously checking for CAP_FOWNER as
well, thus violating its semantics.
[ XFS uses special macros and structures, and in general looked ...
untouchable, so we leave it alone -- but it has been looked over. ]

The (current->fsuid != inode->i_uid) check in generic_permission() and
exec_permission_lite() is left alone, because those operations are
covered by CAP_DAC_OVERRIDE and CAP_DAC_READ_SEARCH. Similarly operations
falling under the purview of CAP_CHOWN and CAP_LEASE are also left alone.

Signed-off-by: Satyam Sharma <ssatyam@cse.iitk.ac.in>
Cc: Al Viro <viro@ftp.linux.org.uk>
Acked-by: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/attr.c                | 4 ++--
 fs/ext2/acl.c            | 2 +-
 fs/ext2/ioctl.c          | 4 ++--
 fs/ext3/acl.c            | 2 +-
 fs/ext3/ioctl.c          | 6 +++---
 fs/ext4/acl.c            | 2 +-
 fs/ext4/ioctl.c          | 6 +++---
 fs/fcntl.c               | 2 +-
 fs/generic_acl.c         | 2 +-
 fs/gfs2/acl.c            | 2 +-
 fs/hfsplus/ioctl.c       | 2 +-
 fs/jffs2/acl.c           | 2 +-
 fs/jfs/ioctl.c           | 2 +-
 fs/jfs/xattr.c           | 2 +-
 fs/namei.c               | 2 +-
 fs/ocfs2/ioctl.c         | 2 +-
 fs/reiserfs/ioctl.c      | 5 ++---
 fs/reiserfs/xattr_acl.c  | 2 +-
 fs/utimes.c              | 2 +-
 fs/xattr.c               | 3 +--
 include/linux/fs.h       | 4 ++++
 security/selinux/hooks.c | 2 +-
 22 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/attr.c b/fs/attr.c
index a0a0c7b07ba3..f8dfc2269d85 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -42,7 +42,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			goto error;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -52,7 +52,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
 
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
-		if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			goto error;
 	}
 fine:
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 7c420b800c34..e58669e1b87c 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -464,7 +464,7 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index e85c48218239..3bcd25422ee4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -36,7 +36,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
@@ -74,7 +74,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 	case EXT2_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *) arg);
 	case EXT2_IOC_SETVERSION:
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EPERM;
 		if (IS_RDONLY(inode))
 			return -EROFS;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 1e5038d9a01b..d34e9967430a 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -489,7 +489,7 @@ ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 965006dba6be..4a2a02c95bf9 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -41,7 +41,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
@@ -122,7 +122,7 @@ flags_err:
 		__u32 generation;
 		int err;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EPERM;
 		if (IS_RDONLY(inode))
 			return -EROFS;
@@ -181,7 +181,7 @@ flags_err:
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 9e882546d91a..a8bae8cd1d5d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -489,7 +489,7 @@ ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 500567dd53b6..7b4aa4543c83 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -40,7 +40,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
@@ -121,7 +121,7 @@ flags_err:
 		__u32 generation;
 		int err;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EPERM;
 		if (IS_RDONLY(inode))
 			return -EROFS;
@@ -180,7 +180,7 @@ flags_err:
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5d51bd..3f22e9f4f691 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -215,7 +215,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	/* O_NOATIME can only be set by the owner or superuser */
 	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
-		if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EPERM;
 
 	/* required for strict SunOS emulation */
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 9ccb78947171..995d63b2e747 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -78,7 +78,7 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
-	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 6e80844367ee..1047a8c7226a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -74,7 +74,7 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
 {
 	if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
 		return -EOPNOTSUPP;
-	if (current->fsuid != ip->i_inode.i_uid && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(&ip->i_inode))
 		return -EPERM;
 	if (S_ISLNK(ip->i_inode.i_mode))
 		return -EOPNOTSUPP;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 79fd10402ea3..b60c0affbec5 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -38,7 +38,7 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *)arg))
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index a46101ee867a..65b3a1b5b88d 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -435,7 +435,7 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value,
 	struct posix_acl *acl;
 	int rc;
 
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index fe063af6fd2f..3c8663bea98c 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -69,7 +69,7 @@ int jfs_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index b2375f0774b7..9b7f2cdaae0a 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -697,7 +697,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 	struct posix_acl *acl;
 	int rc;
 
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 
 	/*
diff --git a/fs/namei.c b/fs/namei.c
index 5e2d98d10c5d..defaa47c11d4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1576,7 +1576,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 
 	/* O_NOATIME can only be set by the owner or superuser */
 	if (flag & O_NOATIME)
-		if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EPERM;
 
 	/*
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index bd68c3f2afbe..87dcece7e1b5 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -63,7 +63,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 		goto bail_unlock;
 
 	status = -EACCES;
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		goto bail_unlock;
 
 	if (!S_ISDIR(inode->i_mode))
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index b484d2913c0d..11a0fcc2d402 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -51,8 +51,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 			if (IS_RDONLY(inode))
 				return -EROFS;
 
-			if ((current->fsuid != inode->i_uid)
-			    && !capable(CAP_FOWNER))
+			if (!is_owner_or_cap(inode))
 				return -EPERM;
 
 			if (get_user(flags, (int __user *)arg))
@@ -81,7 +80,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 	case REISERFS_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *)arg);
 	case REISERFS_IOC_SETVERSION:
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		if (!is_owner_or_cap(inode))
 			return -EPERM;
 		if (IS_RDONLY(inode))
 			return -EROFS;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 5296a29cc5eb..b7e4fa4539de 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -21,7 +21,7 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
 
 	if (!reiserfs_posixacl(inode->i_sb))
 		return -EOPNOTSUPP;
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/utimes.c b/fs/utimes.c
index 83a7e69e706c..682eb63b20ad 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -106,7 +106,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
                 if (IS_IMMUTABLE(inode))
                         goto dput_and_out;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) {
+		if (!is_owner_or_cap(inode)) {
 			if (f) {
 				if (!(f->f_mode & FMODE_WRITE))
 					goto dput_and_out;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4523aca79659..a44fd92caca3 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -60,8 +60,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
 			return -EPERM;
 		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
-		    (mask & MAY_WRITE) && (current->fsuid != inode->i_uid) &&
-		    !capable(CAP_FOWNER))
+		    (mask & MAY_WRITE) && !is_owner_or_cap(inode))
 			return -EPERM;
 	}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58ce336d4a6b..98205f680476 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -284,6 +284,7 @@ extern int dir_notify_enable;
 #include <linux/pid.h>
 #include <linux/mutex.h>
 #include <linux/sysctl.h>
+#include <linux/capability.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -990,6 +991,9 @@ enum {
 #define put_fs_excl() atomic_dec(&current->fs_excl)
 #define has_fs_excl() atomic_read(&current->fs_excl)
 
+#define is_owner_or_cap(inode)	\
+	((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER))
+
 /* not quite ready to be deprecated, but... */
 extern void lock_super(struct super_block *);
 extern void unlock_super(struct super_block *);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 78c3f98fcdcf..520b9998123e 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2318,7 +2318,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, char *name, void *value
 	if (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)
 		return -EOPNOTSUPP;
 
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!is_owner_or_cap(inode))
 		return -EPERM;
 
 	AVC_AUDIT_DATA_INIT(&ad,FS);
-- 
cgit v1.3-8-gc7d7


From 8dfd588c3180b7403c402b4545164ee4543f8f86 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Tue, 17 Jul 2007 22:29:46 +0100
Subject: smp_call_function_single() should be a macro on UP

... or we end up with header include order problems from hell.

E.g. on m68k this is 100% fatal - local_irq_enable() there
wants preempt_count(), which wants task_struct fields, which
we won't have when we are in smp.h pulled from sched.h.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smp.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 8039daced688..259a13c3bd98 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -7,7 +7,6 @@
  */
 
 #include <linux/errno.h>
-#include <asm/system.h>
 
 extern void cpu_idle(void);
 
@@ -100,15 +99,14 @@ static inline int up_smp_call_function(void)
 static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
-static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
-					   void *info, int retry, int wait)
-{
-	WARN_ON(cpuid != 0);
-	local_irq_disable();
-	func(info);
-	local_irq_enable();
-	return 0;
-}
+#define smp_call_function_single(cpuid, func, info, retry, wait) \
+({ \
+	WARN_ON(cpuid != 0);	\
+	local_irq_disable();	\
+	(func)(info);		\
+	local_irq_enable();	\
+	0;			\
+})
 
 #endif /* !SMP */
 
-- 
cgit v1.3-8-gc7d7


From cb32da0416b823b7f4b65e7e85d6cba16ca4d1e1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 18 Jul 2007 09:18:36 +0900
Subject: slob: Kill off duplicate kzalloc() definition.

With the slab zeroing allocations cleanups Christoph stubbed in a generic
kzalloc(), which was missed on SLOB. Follow the SLAB/SLUB changes and
kill off the __kzalloc() wrapper that SLOB was using.

Reported-by: Jan Engelhardt <jengelh@computergmbh.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slob_def.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index a2daf2d418a9..59a3fa476ab9 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -33,14 +33,4 @@ static inline void *__kmalloc(size_t size, gfp_t flags)
 	return kmalloc(size, flags);
 }
 
-/**
- * kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kcalloc).
- */
-static inline void *kzalloc(size_t size, gfp_t flags)
-{
-	return __kzalloc(size, flags);
-}
-
 #endif /* __LINUX_SLOB_DEF_H */
-- 
cgit v1.3-8-gc7d7


From 97ac73506c0ba93f30239bb57b4cfc5d73e68a62 Mon Sep 17 00:00:00 2001
From: Amit Arora <aarora@in.ibm.com>
Date: Tue, 17 Jul 2007 21:42:44 -0400
Subject: sys_fallocate() implementation on i386, x86_64 and powerpc

fallocate() is a new system call being proposed here which will allow
applications to preallocate space to any file(s) in a file system.
Each file system implementation that wants to use this feature will need
to support an inode operation called ->fallocate().
Applications can use this feature to avoid fragmentation to certain
level and thus get faster access speed. With preallocation, applications
also get a guarantee of space for particular file(s) - even if later the
the system becomes full.

Currently, glibc provides an interface called posix_fallocate() which
can be used for similar cause. Though this has the advantage of working
on all file systems, but it is quite slow (since it writes zeroes to
each block that has to be preallocated). Without a doubt, file systems
can do this more efficiently within the kernel, by implementing
the proposed fallocate() system call. It is expected that
posix_fallocate() will be modified to call this new system call first
and incase the kernel/filesystem does not implement it, it should fall
back to the current implementation of writing zeroes to the new blocks.
ToDos:
1. Implementation on other architectures (other than i386, x86_64,
   and ppc). Patches for s390(x) and ia64 are already available from
   previous posts, but it was decided that they should be added later
   once fallocate is in the mainline. Hence not including those patches
   in this take.
2. Changes to glibc,
   a) to support fallocate() system call
   b) to make posix_fallocate() and posix_fallocate64() call fallocate()

Signed-off-by: Amit Arora <aarora@in.ibm.com>
---
 arch/i386/kernel/syscall_table.S |  1 +
 arch/powerpc/kernel/sys_ppc32.c  |  7 +++++
 arch/x86_64/ia32/ia32entry.S     |  1 +
 arch/x86_64/ia32/sys_ia32.c      |  8 ++++++
 fs/open.c                        | 59 ++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/unistd.h        |  3 +-
 include/asm-powerpc/systbl.h     |  1 +
 include/asm-powerpc/unistd.h     |  3 +-
 include/asm-x86_64/unistd.h      |  2 ++
 include/linux/falloc.h           |  6 ++++
 include/linux/fs.h               |  2 ++
 include/linux/syscalls.h         |  1 +
 12 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/falloc.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index bf6adce52267..8344c70adf61 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -323,3 +323,4 @@ ENTRY(sys_call_table)
 	.long sys_signalfd
 	.long sys_timerfd
 	.long sys_eventfd
+	.long sys_fallocate
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index b42cbf1e2d7d..bd85b5fd08c8 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -773,6 +773,13 @@ asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4,
 	return sys_truncate(path, (high << 32) | low);
 }
 
+asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo,
+				     u32 lenhi, u32 lenlo)
+{
+	return sys_fallocate(fd, mode, ((loff_t)offhi << 32) | offlo,
+			     ((loff_t)lenhi << 32) | lenlo);
+}
+
 asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high,
 				 unsigned long low)
 {
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 782dea819438..3f66e970d86f 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -719,4 +719,5 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd
 	.quad compat_sys_timerfd
 	.quad sys_eventfd
+	.quad sys32_fallocate
 ia32_syscall_end:
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 99a78a3cce7c..bee96d614432 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -879,3 +879,11 @@ asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
 	return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
 				len, advice);
 }
+
+asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
+				unsigned offset_hi, unsigned len_lo,
+				unsigned len_hi)
+{
+	return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
+			     ((u64)len_hi << 32) | len_lo);
+}
diff --git a/fs/open.c b/fs/open.c
index be6a457f4226..a6b054edacba 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -26,6 +26,7 @@
 #include <linux/syscalls.h>
 #include <linux/rcupdate.h>
 #include <linux/audit.h>
+#include <linux/falloc.h>
 
 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
@@ -352,6 +353,64 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
 }
 #endif
 
+asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+{
+	struct file *file;
+	struct inode *inode;
+	long ret = -EINVAL;
+
+	if (offset < 0 || len <= 0)
+		goto out;
+
+	/* Return error if mode is not supported */
+	ret = -EOPNOTSUPP;
+	if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
+		goto out;
+
+	ret = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+	if (!(file->f_mode & FMODE_WRITE))
+		goto out_fput;
+	/*
+	 * Revalidate the write permissions, in case security policy has
+	 * changed since the files were opened.
+	 */
+	ret = security_file_permission(file, MAY_WRITE);
+	if (ret)
+		goto out_fput;
+
+	inode = file->f_path.dentry->d_inode;
+
+	ret = -ESPIPE;
+	if (S_ISFIFO(inode->i_mode))
+		goto out_fput;
+
+	ret = -ENODEV;
+	/*
+	 * Let individual file system decide if it supports preallocation
+	 * for directories or not.
+	 */
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		goto out_fput;
+
+	ret = -EFBIG;
+	/* Check for wrap through zero too */
+	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
+		goto out_fput;
+
+	if (inode->i_op && inode->i_op->fallocate)
+		ret = inode->i_op->fallocate(inode, mode, offset, len);
+	else
+		ret = -ENOSYS;
+
+out_fput:
+	fput(file);
+out:
+	return ret;
+}
+
 /*
  * access() needs to use the real uid/gid, not the effective uid/gid.
  * We do this by temporarily clearing all FS-related capabilities and
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index e84ace1ec8bf..9b15545eb9b5 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -329,10 +329,11 @@
 #define __NR_signalfd		321
 #define __NR_timerfd		322
 #define __NR_eventfd		323
+#define __NR_fallocate		324
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 324
+#define NR_syscalls 325
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-powerpc/systbl.h b/include/asm-powerpc/systbl.h
index 1cc3f9cb6f4e..cc6d87228258 100644
--- a/include/asm-powerpc/systbl.h
+++ b/include/asm-powerpc/systbl.h
@@ -308,6 +308,7 @@ COMPAT_SYS_SPU(move_pages)
 SYSCALL_SPU(getcpu)
 COMPAT_SYS(epoll_pwait)
 COMPAT_SYS_SPU(utimensat)
+COMPAT_SYS(fallocate)
 COMPAT_SYS_SPU(signalfd)
 COMPAT_SYS_SPU(timerfd)
 SYSCALL_SPU(eventfd)
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index f71c6061f1ec..97d82b6a9406 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -331,10 +331,11 @@
 #define __NR_timerfd		306
 #define __NR_eventfd		307
 #define __NR_sync_file_range2	308
+#define __NR_fallocate		309
 
 #ifdef __KERNEL__
 
-#define __NR_syscalls		309
+#define __NR_syscalls		310
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 8696f8ad401e..fc4e73f5f1fa 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -630,6 +630,8 @@ __SYSCALL(__NR_signalfd, sys_signalfd)
 __SYSCALL(__NR_timerfd, sys_timerfd)
 #define __NR_eventfd		284
 __SYSCALL(__NR_eventfd, sys_eventfd)
+#define __NR_fallocate		285
+__SYSCALL(__NR_fallocate, sys_fallocate)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
new file mode 100644
index 000000000000..8e912ab6a072
--- /dev/null
+++ b/include/linux/falloc.h
@@ -0,0 +1,6 @@
+#ifndef _FALLOC_H_
+#define _FALLOC_H_
+
+#define FALLOC_FL_KEEP_SIZE	0x01 /* default is extend size */
+
+#endif /* _FALLOC_H_ */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98205f680476..0b806c5e32eb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1147,6 +1147,8 @@ struct inode_operations {
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
 	void (*truncate_range)(struct inode *, loff_t, loff_t);
+	long (*fallocate)(struct inode *inode, int mode, loff_t offset,
+			  loff_t len);
 };
 
 struct seq_file;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 83d0ec11235e..7a8b1e3322e0 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -610,6 +610,7 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
 			    const struct itimerspec __user *utmr);
 asmlinkage long sys_eventfd(unsigned int count);
+asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
-- 
cgit v1.3-8-gc7d7


From a2df2a63407803a833f82e1fa6693826c8c9d584 Mon Sep 17 00:00:00 2001
From: Amit Arora <aarora@in.ibm.com>
Date: Tue, 17 Jul 2007 21:42:41 -0400
Subject: fallocate support in ext4

This patch implements ->fallocate() inode operation in ext4. With this
patch users of ext4 file systems will be able to use fallocate() system
call for persistent preallocation. Current implementation only supports
preallocation for regular files (directories not supported as of date)
with extent maps. This patch does not support block-mapped files currently.
Only FALLOC_ALLOCATE and FALLOC_RESV_SPACE modes are being supported as of
now.

Signed-off-by: Amit Arora <aarora@in.ibm.com>
---
 fs/ext4/extents.c               | 249 +++++++++++++++++++++++++++++++++-------
 fs/ext4/file.c                  |   1 +
 include/linux/ext4_fs.h         |   8 ++
 include/linux/ext4_fs_extents.h |  15 +++
 4 files changed, 232 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b9ce24129070..ba25832a756c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/falloc.h>
 #include <linux/ext4_fs_extents.h>
 #include <asm/uaccess.h>
 
@@ -282,7 +283,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 		} else if (path->p_ext) {
 			ext_debug("  %d:%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
-				  le16_to_cpu(path->p_ext->ee_len),
+				  ext4_ext_get_actual_len(path->p_ext),
 				  ext_pblock(path->p_ext));
 		} else
 			ext_debug("  []");
@@ -305,7 +306,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
 		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
-			  le16_to_cpu(ex->ee_len), ext_pblock(ex));
+			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
 	}
 	ext_debug("\n");
 }
@@ -425,7 +426,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
 	ext_debug("  -> %d:%llu:%d ",
 			le32_to_cpu(path->p_ext->ee_block),
 			ext_pblock(path->p_ext),
-			le16_to_cpu(path->p_ext->ee_len));
+			ext4_ext_get_actual_len(path->p_ext));
 
 #ifdef CHECK_BINSEARCH
 	{
@@ -686,7 +687,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		ext_debug("move %d:%llu:%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
 				ext_pblock(path[depth].p_ext),
-				le16_to_cpu(path[depth].p_ext->ee_len),
+				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
 		/*memmove(ex++, path[depth].p_ext++,
 				sizeof(struct ext4_extent));
@@ -1106,7 +1107,19 @@ static int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 				struct ext4_extent *ex2)
 {
-	if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+	unsigned short ext1_ee_len, ext2_ee_len;
+
+	/*
+	 * Make sure that either both extents are uninitialized, or
+	 * both are _not_.
+	 */
+	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+		return 0;
+
+	ext1_ee_len = ext4_ext_get_actual_len(ex1);
+	ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
 			le32_to_cpu(ex2->ee_block))
 		return 0;
 
@@ -1115,14 +1128,14 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	 * as an RO_COMPAT feature, refuse to merge to extents if
 	 * this can result in the top bit of ee_len being set.
 	 */
-	if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+	if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
 		return 0;
 #ifdef AGGRESSIVE_TEST
 	if (le16_to_cpu(ex1->ee_len) >= 4)
 		return 0;
 #endif
 
-	if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
 		return 1;
 	return 0;
 }
@@ -1144,7 +1157,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
 	unsigned int ret = 0;
 
 	b1 = le32_to_cpu(newext->ee_block);
-	len1 = le16_to_cpu(newext->ee_len);
+	len1 = ext4_ext_get_actual_len(newext);
 	depth = ext_depth(inode);
 	if (!path[depth].p_ext)
 		goto out;
@@ -1191,8 +1204,9 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	struct ext4_extent *nearex; /* nearest extent */
 	struct ext4_ext_path *npath = NULL;
 	int depth, len, err, next;
+	unsigned uninitialized = 0;
 
-	BUG_ON(newext->ee_len == 0);
+	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
 	BUG_ON(path[depth].p_hdr == NULL);
@@ -1200,14 +1214,24 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	/* try to insert block into found extent and return */
 	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append %d block to %d:%d (from %llu)\n",
-				le16_to_cpu(newext->ee_len),
+				ext4_ext_get_actual_len(newext),
 				le32_to_cpu(ex->ee_block),
-				le16_to_cpu(ex->ee_len), ext_pblock(ex));
+				ext4_ext_get_actual_len(ex), ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
 			return err;
-		ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
-					 + le16_to_cpu(newext->ee_len));
+
+		/*
+		 * ext4_can_extents_be_merged should have checked that either
+		 * both extents are uninitialized, or both aren't. Thus we
+		 * need to check only one of them here.
+		 */
+		if (ext4_ext_is_uninitialized(ex))
+			uninitialized = 1;
+		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+					+ ext4_ext_get_actual_len(newext));
+		if (uninitialized)
+			ext4_ext_mark_uninitialized(ex);
 		eh = path[depth].p_hdr;
 		nearex = ex;
 		goto merge;
@@ -1263,7 +1287,7 @@ has_space:
 		ext_debug("first extent in the leaf: %d:%llu:%d\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
-				le16_to_cpu(newext->ee_len));
+				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
 	} else if (le32_to_cpu(newext->ee_block)
 			   > le32_to_cpu(nearex->ee_block)) {
@@ -1276,7 +1300,7 @@ has_space:
 					"move %d from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
 					ext_pblock(newext),
-					le16_to_cpu(newext->ee_len),
+					ext4_ext_get_actual_len(newext),
 					nearex, len, nearex + 1, nearex + 2);
 			memmove(nearex + 2, nearex + 1, len);
 		}
@@ -1289,7 +1313,7 @@ has_space:
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
-				le16_to_cpu(newext->ee_len),
+				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
 		memmove(nearex + 1, nearex, len);
 		path[depth].p_ext = nearex;
@@ -1308,8 +1332,13 @@ merge:
 		if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
 			break;
 		/* merge with next extent! */
-		nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
-					     + le16_to_cpu(nearex[1].ee_len));
+		if (ext4_ext_is_uninitialized(nearex))
+			uninitialized = 1;
+		nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
+					+ ext4_ext_get_actual_len(nearex + 1));
+		if (uninitialized)
+			ext4_ext_mark_uninitialized(nearex);
+
 		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
 			len = (EXT_LAST_EXTENT(eh) - nearex - 1)
 					* sizeof(struct ext4_extent);
@@ -1379,8 +1408,8 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
 			end = le32_to_cpu(ex->ee_block);
 			if (block + num < end)
 				end = block + num;
-		} else if (block >=
-			     le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+		} else if (block >= le32_to_cpu(ex->ee_block)
+					+ ext4_ext_get_actual_len(ex)) {
 			/* need to allocate space after found extent */
 			start = block;
 			end = block + num;
@@ -1392,7 +1421,8 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
 			 * by found extent
 			 */
 			start = block;
-			end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+			end = le32_to_cpu(ex->ee_block)
+				+ ext4_ext_get_actual_len(ex);
 			if (block + num < end)
 				end = block + num;
 			exists = 1;
@@ -1408,7 +1438,7 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
 			cbex.ec_type = EXT4_EXT_CACHE_GAP;
 		} else {
 			cbex.ec_block = le32_to_cpu(ex->ee_block);
-			cbex.ec_len = le16_to_cpu(ex->ee_len);
+			cbex.ec_len = ext4_ext_get_actual_len(ex);
 			cbex.ec_start = ext_pblock(ex);
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
@@ -1481,15 +1511,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 		ext_debug("cache gap(before): %lu [%lu:%lu]",
 				(unsigned long) block,
 				(unsigned long) le32_to_cpu(ex->ee_block),
-				(unsigned long) le16_to_cpu(ex->ee_len));
+				(unsigned long) ext4_ext_get_actual_len(ex));
 	} else if (block >= le32_to_cpu(ex->ee_block)
-			    + le16_to_cpu(ex->ee_len)) {
+			+ ext4_ext_get_actual_len(ex)) {
 		lblock = le32_to_cpu(ex->ee_block)
-			 + le16_to_cpu(ex->ee_len);
+			+ ext4_ext_get_actual_len(ex);
 		len = ext4_ext_next_allocated_block(path);
 		ext_debug("cache gap(after): [%lu:%lu] %lu",
 				(unsigned long) le32_to_cpu(ex->ee_block),
-				(unsigned long) le16_to_cpu(ex->ee_len),
+				(unsigned long) ext4_ext_get_actual_len(ex),
 				(unsigned long) block);
 		BUG_ON(len == lblock);
 		len = len - lblock;
@@ -1619,12 +1649,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 				unsigned long from, unsigned long to)
 {
 	struct buffer_head *bh;
+	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
 	int i;
 
 #ifdef EXTENTS_STATS
 	{
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-		unsigned short ee_len =  le16_to_cpu(ex->ee_len);
 		spin_lock(&sbi->s_ext_stats_lock);
 		sbi->s_ext_blocks += ee_len;
 		sbi->s_ext_extents++;
@@ -1638,12 +1668,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	}
 #endif
 	if (from >= le32_to_cpu(ex->ee_block)
-	    && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		unsigned long num;
 		ext4_fsblk_t start;
-		num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
-		start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+		num = le32_to_cpu(ex->ee_block) + ee_len - from;
+		start = ext_pblock(ex) + ee_len - num;
 		ext_debug("free last %lu blocks starting %llu\n", num, start);
 		for (i = 0; i < num; i++) {
 			bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1651,12 +1681,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		}
 		ext4_free_blocks(handle, inode, start, num);
 	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk("strange request: removal %lu-%lu from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+			from, to, le32_to_cpu(ex->ee_block), ee_len);
 	} else {
 		printk("strange request: removal(2) %lu-%lu from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+			from, to, le32_to_cpu(ex->ee_block), ee_len);
 	}
 	return 0;
 }
@@ -1671,6 +1701,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned a, b, block, num;
 	unsigned long ex_ee_block;
 	unsigned short ex_ee_len;
+	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
 
 	ext_debug("truncate since %lu in leaf\n", start);
@@ -1685,7 +1716,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
-	ex_ee_len = le16_to_cpu(ex->ee_len);
+	if (ext4_ext_is_uninitialized(ex))
+		uninitialized = 1;
+	ex_ee_len = ext4_ext_get_actual_len(ex);
 
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
 			ex_ee_block + ex_ee_len > start) {
@@ -1753,6 +1786,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		ex->ee_block = cpu_to_le32(block);
 		ex->ee_len = cpu_to_le16(num);
+		if (uninitialized)
+			ext4_ext_mark_uninitialized(ex);
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
 		if (err)
@@ -1762,7 +1797,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 				ext_pblock(ex));
 		ex--;
 		ex_ee_block = le32_to_cpu(ex->ee_block);
-		ex_ee_len = le16_to_cpu(ex->ee_len);
+		ex_ee_len = ext4_ext_get_actual_len(ex);
 	}
 
 	if (correct_index && eh->eh_entries)
@@ -2038,7 +2073,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (ex) {
 		unsigned long ee_block = le32_to_cpu(ex->ee_block);
 		ext4_fsblk_t ee_start = ext_pblock(ex);
-		unsigned short ee_len  = le16_to_cpu(ex->ee_len);
+		unsigned short ee_len;
 
 		/*
 		 * Allow future support for preallocated extents to be added
@@ -2046,8 +2081,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 * Uninitialized extents are treated as holes, except that
 		 * we avoid (fail) allocating new blocks during a write.
 		 */
-		if (ee_len > EXT_MAX_LEN)
+		if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN)
 			goto out2;
+		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
 		if (iblock >= ee_block && iblock < ee_block + ee_len) {
 			newblock = iblock - ee_block + ee_start;
@@ -2055,8 +2091,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			allocated = ee_len - (iblock - ee_block);
 			ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
 					ee_block, ee_len, newblock);
-			ext4_ext_put_in_cache(inode, ee_block, ee_len,
-						ee_start, EXT4_EXT_CACHE_EXTENT);
+			/* Do not put uninitialized extent in the cache */
+			if (!ext4_ext_is_uninitialized(ex))
+				ext4_ext_put_in_cache(inode, ee_block,
+							ee_len, ee_start,
+							EXT4_EXT_CACHE_EXTENT);
 			goto out;
 		}
 	}
@@ -2098,6 +2137,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
 	newex.ee_len = cpu_to_le16(allocated);
+	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
+		ext4_ext_mark_uninitialized(&newex);
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
 	if (err) {
 		/* free data blocks we just allocated */
@@ -2113,8 +2154,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newblock = ext_pblock(&newex);
 	__set_bit(BH_New, &bh_result->b_state);
 
-	ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
-				EXT4_EXT_CACHE_EXTENT);
+	/* Cache only when it is _not_ an uninitialized extent */
+	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
+		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+						EXT4_EXT_CACHE_EXTENT);
 out:
 	if (allocated > max_blocks)
 		allocated = max_blocks;
@@ -2217,3 +2260,127 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
 
 	return needed;
 }
+
+/*
+ * preallocate space for a file. This implements ext4's fallocate inode
+ * operation, which gets called from sys_fallocate system call.
+ * For block-mapped files, posix_fallocate should fall back to the method
+ * of writing zeroes to the required new blocks (the same behavior which is
+ * expected for file systems which do not support fallocate() system call).
+ */
+long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+{
+	handle_t *handle;
+	ext4_fsblk_t block, max_blocks;
+	ext4_fsblk_t nblocks = 0;
+	int ret = 0;
+	int ret2 = 0;
+	int retries = 0;
+	struct buffer_head map_bh;
+	unsigned int credits, blkbits = inode->i_blkbits;
+
+	/*
+	 * currently supporting (pre)allocate mode for extent-based
+	 * files _only_
+	 */
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return -EOPNOTSUPP;
+
+	/* preallocation to directories is currently not supported */
+	if (S_ISDIR(inode->i_mode))
+		return -ENODEV;
+
+	block = offset >> blkbits;
+	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+			- block;
+
+	/*
+	 * credits to insert 1 extent into extent tree + buffers to be able to
+	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
+	 */
+	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+retry:
+	while (ret >= 0 && ret < max_blocks) {
+		block = block + ret;
+		max_blocks = max_blocks - ret;
+		handle = ext4_journal_start(inode, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			break;
+		}
+
+		ret = ext4_ext_get_blocks(handle, inode, block,
+					  max_blocks, &map_bh,
+					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
+		WARN_ON(!ret);
+		if (!ret) {
+			ext4_error(inode->i_sb, "ext4_fallocate",
+				   "ext4_ext_get_blocks returned 0! inode#%lu"
+				   ", block=%llu, max_blocks=%llu",
+				   inode->i_ino, block, max_blocks);
+			ret = -EIO;
+			ext4_mark_inode_dirty(handle, inode);
+			ret2 = ext4_journal_stop(handle);
+			break;
+		}
+		if (ret > 0) {
+			/* check wrap through sign-bit/zero here */
+			if ((block + ret) < 0 || (block + ret) < block) {
+				ret = -EIO;
+				ext4_mark_inode_dirty(handle, inode);
+				ret2 = ext4_journal_stop(handle);
+				break;
+			}
+			if (buffer_new(&map_bh) && ((block + ret) >
+			    (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits)
+			    >> blkbits)))
+					nblocks = nblocks + ret;
+		}
+
+		/* Update ctime if new blocks get allocated */
+		if (nblocks) {
+			struct timespec now;
+
+			now = current_fs_time(inode->i_sb);
+			if (!timespec_equal(&inode->i_ctime, &now))
+				inode->i_ctime = now;
+		}
+
+		ext4_mark_inode_dirty(handle, inode);
+		ret2 = ext4_journal_stop(handle);
+		if (ret2)
+			break;
+	}
+
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	/*
+	 * Time to update the file size.
+	 * Update only when preallocation was requested beyond the file size.
+	 */
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    (offset + len) > i_size_read(inode)) {
+		if (ret > 0) {
+			/*
+			 * if no error, we assume preallocation succeeded
+			 * completely
+			 */
+			mutex_lock(&inode->i_mutex);
+			i_size_write(inode, offset + len);
+			EXT4_I(inode)->i_disksize = i_size_read(inode);
+			mutex_unlock(&inode->i_mutex);
+		} else if (ret < 0 && nblocks) {
+			/* Handle partial allocation scenario */
+			loff_t newsize;
+
+			mutex_lock(&inode->i_mutex);
+			newsize  = (nblocks << blkbits) + i_size_read(inode);
+			i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits));
+			EXT4_I(inode)->i_disksize = i_size_read(inode);
+			mutex_unlock(&inode->i_mutex);
+		}
+	}
+
+	return ret > 0 ? ret2 : ret;
+}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d4c8186aed64..1a81cd66d63b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext4_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext4_permission,
+	.fallocate	= ext4_fallocate,
 };
 
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index de1f9f78625a..87c2d7a05b01 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -102,6 +102,7 @@
 				 EXT4_GOOD_OLD_FIRST_INO : \
 				 (s)->s_first_ino)
 #endif
+#define EXT4_BLOCK_ALIGN(size, blkbits)		ALIGN((size), (1 << (blkbits)))
 
 /*
  * Macro-instructions used to manage fragments
@@ -225,6 +226,11 @@ struct ext4_new_group_data {
 	__u32 free_blocks_count;
 };
 
+/*
+ * Following is used by preallocation code to tell get_blocks() that we
+ * want uninitialzed extents.
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT		2
 
 /*
  * ioctl commands
@@ -983,6 +989,8 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 extern void ext4_ext_truncate(struct inode *, struct page *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
+extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+			  loff_t len);
 static inline int
 ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 			unsigned long max_blocks, struct buffer_head *bh,
diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h
index acfe59740b03..e3d5afc6f23e 100644
--- a/include/linux/ext4_fs_extents.h
+++ b/include/linux/ext4_fs_extents.h
@@ -188,6 +188,21 @@ ext4_ext_invalidate_cache(struct inode *inode)
 	EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
 }
 
+static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
+{
+	ext->ee_len |= cpu_to_le16(0x8000);
+}
+
+static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
+{
+	return (int)(le16_to_cpu((ext)->ee_len) & 0x8000);
+}
+
+static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
+{
+	return (int)(le16_to_cpu((ext)->ee_len) & 0x7FFF);
+}
+
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-- 
cgit v1.3-8-gc7d7


From 56055d3ae4cc7fa6d2b10885f20269de8a989ed7 Mon Sep 17 00:00:00 2001
From: Amit Arora <aarora@in.ibm.com>
Date: Tue, 17 Jul 2007 21:42:38 -0400
Subject: write support for preallocated blocks

This patch adds write support to the uninitialized extents that get
created when a preallocation is done using fallocate(). It takes care of
splitting the extents into multiple (upto three) extents and merging the
new split extents with neighbouring ones, if possible.

Signed-off-by: Amit Arora <aarora@in.ibm.com>
---
 fs/ext4/extents.c               | 254 +++++++++++++++++++++++++++++++++++-----
 include/linux/ext4_fs_extents.h |   3 +
 2 files changed, 225 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ba25832a756c..ded3d469f978 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1140,6 +1140,53 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	return 0;
 }
 
+/*
+ * This function tries to merge the "ex" extent to the next extent in the tree.
+ * It always tries to merge towards right. If you want to merge towards
+ * left, pass "ex - 1" as argument instead of "ex".
+ * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
+ * 1 if they got merged.
+ */
+int ext4_ext_try_to_merge(struct inode *inode,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *ex)
+{
+	struct ext4_extent_header *eh;
+	unsigned int depth, len;
+	int merge_done = 0;
+	int uninitialized = 0;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	while (ex < EXT_LAST_EXTENT(eh)) {
+		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
+			break;
+		/* merge with next extent! */
+		if (ext4_ext_is_uninitialized(ex))
+			uninitialized = 1;
+		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+				+ ext4_ext_get_actual_len(ex + 1));
+		if (uninitialized)
+			ext4_ext_mark_uninitialized(ex);
+
+		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
+			len = (EXT_LAST_EXTENT(eh) - ex - 1)
+				* sizeof(struct ext4_extent);
+			memmove(ex + 1, ex + 2, len);
+		}
+		eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1);
+		merge_done = 1;
+		WARN_ON(eh->eh_entries == 0);
+		if (!eh->eh_entries)
+			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
+			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+	}
+
+	return merge_done;
+}
+
 /*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
@@ -1328,25 +1375,7 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	while (nearex < EXT_LAST_EXTENT(eh)) {
-		if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
-			break;
-		/* merge with next extent! */
-		if (ext4_ext_is_uninitialized(nearex))
-			uninitialized = 1;
-		nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
-					+ ext4_ext_get_actual_len(nearex + 1));
-		if (uninitialized)
-			ext4_ext_mark_uninitialized(nearex);
-
-		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
-			len = (EXT_LAST_EXTENT(eh) - nearex - 1)
-					* sizeof(struct ext4_extent);
-			memmove(nearex + 1, nearex + 2, len);
-		}
-		eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
-		BUG_ON(eh->eh_entries == 0);
-	}
+	ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
 
@@ -2012,15 +2041,158 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
+/*
+ * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * to an uninitialized extent. It may result in splitting the uninitialized
+ * extent into multiple extents (upto three - one initialized and two
+ * uninitialized).
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be initialized
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Somone is writing in middle of the extent
+ */
+int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
+					struct ext4_ext_path *path,
+					ext4_fsblk_t iblock,
+					unsigned long max_blocks)
+{
+	struct ext4_extent *ex, newex;
+	struct ext4_extent *ex1 = NULL;
+	struct ext4_extent *ex2 = NULL;
+	struct ext4_extent *ex3 = NULL;
+	struct ext4_extent_header *eh;
+	unsigned int allocated, ee_block, ee_len, depth;
+	ext4_fsblk_t newblock;
+	int err = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	allocated = ee_len - (iblock - ee_block);
+	newblock = iblock - ee_block + ext_pblock(ex);
+	ex2 = ex;
+
+	/* ex1: ee_block to iblock - 1 : uninitialized */
+	if (iblock > ee_block) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/*
+	 * for sanity, update the length of the ex2 extent before
+	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
+	 * overlap of blocks.
+	 */
+	if (!ex1 && allocated > max_blocks)
+		ex2->ee_len = cpu_to_le16(max_blocks);
+	/* ex3: to ee_block + ee_len : uninitialised */
+	if (allocated > max_blocks) {
+		unsigned int newdepth;
+		ex3 = &newex;
+		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+		ext4_ext_store_pblock(ex3, newblock + max_blocks);
+		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ext4_ext_mark_uninitialized(ex3);
+		err = ext4_ext_insert_extent(handle, inode, path, ex3);
+		if (err)
+			goto out;
+		/*
+		 * The depth, and hence eh & ex might change
+		 * as part of the insert above.
+		 */
+		newdepth = ext_depth(inode);
+		if (newdepth != depth) {
+			depth = newdepth;
+			path = ext4_ext_find_extent(inode, iblock, NULL);
+			if (IS_ERR(path)) {
+				err = PTR_ERR(path);
+				path = NULL;
+				goto out;
+			}
+			eh = path[depth].p_hdr;
+			ex = path[depth].p_ext;
+			if (ex2 != &newex)
+				ex2 = ex;
+		}
+		allocated = max_blocks;
+	}
+	/*
+	 * If there was a change of depth as part of the
+	 * insertion of ex3 above, we need to update the length
+	 * of the ex1 extent again here
+	 */
+	if (ex1 && ex1 != ex) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/* ex2: iblock to iblock + maxblocks-1 : initialised */
+	ex2->ee_block = cpu_to_le32(iblock);
+	ex2->ee_start = cpu_to_le32(newblock);
+	ext4_ext_store_pblock(ex2, newblock);
+	ex2->ee_len = cpu_to_le16(allocated);
+	if (ex2 != ex)
+		goto insert;
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/*
+	 * New (initialized) extent starts from the first block
+	 * in the current extent. i.e., ex2 == ex
+	 * We have to see if it can be merged with the extent
+	 * on the left.
+	 */
+	if (ex2 > EXT_FIRST_EXTENT(eh)) {
+		/*
+		 * To merge left, pass "ex2 - 1" to try_to_merge(),
+		 * since it merges towards right _only_.
+		 */
+		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
+		if (ret) {
+			err = ext4_ext_correct_indexes(handle, inode, path);
+			if (err)
+				goto out;
+			depth = ext_depth(inode);
+			ex2--;
+		}
+	}
+	/*
+	 * Try to Merge towards right. This might be required
+	 * only when the whole extent is being written to.
+	 * i.e. ex2 == ex and ex3 == NULL.
+	 */
+	if (!ex3) {
+		ret = ext4_ext_try_to_merge(inode, path, ex2);
+		if (ret) {
+			err = ext4_ext_correct_indexes(handle, inode, path);
+			if (err)
+				goto out;
+		}
+	}
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	goto out;
+insert:
+	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+out:
+	return err ? err : allocated;
+}
+
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize)
 {
 	struct ext4_ext_path *path = NULL;
+	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
 	ext4_fsblk_t goal, newblock;
-	int err = 0, depth;
+	int err = 0, depth, ret;
 	unsigned long allocated = 0;
 
 	__clear_bit(BH_New, &bh_result->b_state);
@@ -2033,8 +2205,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (goal) {
 		if (goal == EXT4_EXT_CACHE_GAP) {
 			if (!create) {
-				/* block isn't allocated yet and
-				 * user doesn't want to allocate it */
+				/*
+				 * block isn't allocated yet and
+				 * user doesn't want to allocate it
+				 */
 				goto out2;
 			}
 			/* we should allocate requested block */
@@ -2068,6 +2242,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 */
 	BUG_ON(path[depth].p_ext == NULL && depth != 0);
+	eh = path[depth].p_hdr;
 
 	ex = path[depth].p_ext;
 	if (ex) {
@@ -2076,13 +2251,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		unsigned short ee_len;
 
 		/*
-		 * Allow future support for preallocated extents to be added
-		 * as an RO_COMPAT feature:
 		 * Uninitialized extents are treated as holes, except that
-		 * we avoid (fail) allocating new blocks during a write.
+		 * we split out initialized portions during a write.
 		 */
-		if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN)
-			goto out2;
 		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
 		if (iblock >= ee_block && iblock < ee_block + ee_len) {
@@ -2091,12 +2262,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			allocated = ee_len - (iblock - ee_block);
 			ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
 					ee_block, ee_len, newblock);
+
 			/* Do not put uninitialized extent in the cache */
-			if (!ext4_ext_is_uninitialized(ex))
+			if (!ext4_ext_is_uninitialized(ex)) {
 				ext4_ext_put_in_cache(inode, ee_block,
 							ee_len, ee_start,
 							EXT4_EXT_CACHE_EXTENT);
-			goto out;
+				goto out;
+			}
+			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
+				goto out;
+			if (!create)
+				goto out2;
+
+			ret = ext4_ext_convert_to_initialized(handle, inode,
+								path, iblock,
+								max_blocks);
+			if (ret <= 0)
+				goto out2;
+			else
+				allocated = ret;
+			goto outnew;
 		}
 	}
 
@@ -2105,8 +2291,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * we couldn't try to create block if create flag is zero
 	 */
 	if (!create) {
-		/* put just found gap into cache to speed up
-		 * subsequent requests */
+		/*
+		 * put just found gap into cache to speed up
+		 * subsequent requests
+		 */
 		ext4_ext_put_gap_in_cache(inode, path, iblock);
 		goto out2;
 	}
@@ -2152,6 +2340,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
+outnew:
 	__set_bit(BH_New, &bh_result->b_state);
 
 	/* Cache only when it is _not_ an uninitialized extent */
@@ -2221,7 +2410,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	err = ext4_ext_remove_space(inode, last_block);
 
 	/* In a multi-transaction truncate, we only make the final
-	 * transaction synchronous. */
+	 * transaction synchronous.
+	 */
 	if (IS_SYNC(inode))
 		handle->h_sync = 1;
 
diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h
index e3d5afc6f23e..edf49ec89eac 100644
--- a/include/linux/ext4_fs_extents.h
+++ b/include/linux/ext4_fs_extents.h
@@ -205,6 +205,9 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
+extern int ext4_ext_try_to_merge(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
 extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
-- 
cgit v1.3-8-gc7d7


From e4f48861993294c27849076741eb0c090482560b Mon Sep 17 00:00:00 2001
From: Semih Hazar <semih.hazar@indefia.com>
Date: Wed, 18 Jul 2007 00:35:56 -0400
Subject: Input: ads7846 - introduce sample settling delay

The ads7846 driver has support for filtering, but when the chip gets
deselected between samples this causes noise. This patch adds support
for an optional settling delay time, so that two consecutive samples
will be taken with the specified delay time apart.  This ensures that
the chip won't be deselected, so the noise won't appear.

Filtering can still be done, but will have less work to do since each
time a new sample is taken the same delay applies.

Signed-off-by: Semih Hazar <semih.hazar@indefia.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/ads7846.c | 65 ++++++++++++++++++++++++++++++++++++-
 include/linux/spi/ads7846.h         |  8 +++++
 2 files changed, 72 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index 1c9069cd3bae..103ee6ad299e 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -95,7 +95,7 @@ struct ads7846 {
 	u16			dummy;		/* for the pwrdown read */
 	struct ts_event		tc;
 
-	struct spi_transfer	xfer[10];
+	struct spi_transfer	xfer[18];
 	struct spi_message	msg[5];
 	struct spi_message	*last_msg;
 	int			msg_idx;
@@ -936,6 +936,24 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 	x->len = 2;
 	spi_message_add_tail(x, m);
 
+	/* the first sample after switching drivers can be low quality;
+	 * optionally discard it, using a second one after the signals
+	 * have had enough time to stabilize.
+	 */
+	if (pdata->settle_delay_usecs) {
+		x->delay_usecs = pdata->settle_delay_usecs;
+
+		x++;
+		x->tx_buf = &ts->read_y;
+		x->len = 1;
+		spi_message_add_tail(x, m);
+
+		x++;
+		x->rx_buf = &ts->tc.y;
+		x->len = 2;
+		spi_message_add_tail(x, m);
+	}
+
 	m->complete = ads7846_rx_val;
 	m->context = ts;
 
@@ -954,6 +972,21 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 	x->len = 2;
 	spi_message_add_tail(x, m);
 
+	/* ... maybe discard first sample ... */
+	if (pdata->settle_delay_usecs) {
+		x->delay_usecs = pdata->settle_delay_usecs;
+
+		x++;
+		x->tx_buf = &ts->read_x;
+		x->len = 1;
+		spi_message_add_tail(x, m);
+
+		x++;
+		x->rx_buf = &ts->tc.x;
+		x->len = 2;
+		spi_message_add_tail(x, m);
+	}
+
 	m->complete = ads7846_rx_val;
 	m->context = ts;
 
@@ -973,6 +1006,21 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 		x->len = 2;
 		spi_message_add_tail(x, m);
 
+		/* ... maybe discard first sample ... */
+		if (pdata->settle_delay_usecs) {
+			x->delay_usecs = pdata->settle_delay_usecs;
+
+			x++;
+			x->tx_buf = &ts->read_z1;
+			x->len = 1;
+			spi_message_add_tail(x, m);
+
+			x++;
+			x->rx_buf = &ts->tc.z1;
+			x->len = 2;
+			spi_message_add_tail(x, m);
+		}
+
 		m->complete = ads7846_rx_val;
 		m->context = ts;
 
@@ -990,6 +1038,21 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 		x->len = 2;
 		spi_message_add_tail(x, m);
 
+		/* ... maybe discard first sample ... */
+		if (pdata->settle_delay_usecs) {
+			x->delay_usecs = pdata->settle_delay_usecs;
+
+			x++;
+			x->tx_buf = &ts->read_z2;
+			x->len = 1;
+			spi_message_add_tail(x, m);
+
+			x++;
+			x->rx_buf = &ts->tc.z2;
+			x->len = 2;
+			spi_message_add_tail(x, m);
+		}
+
 		m->complete = ads7846_rx_val;
 		m->context = ts;
 	}
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index 3387e44dfd13..a44fa7a02bd9 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -16,6 +16,14 @@ struct ads7846_platform_data {
 	u16	vref_delay_usecs;	/* 0 for external vref; etc */
 	int	keep_vref_on:1;		/* set to keep vref on for differential
 					 * measurements as well */
+
+	/* Settling time of the analog signals; a function of Vcc and the
+	 * capacitance on the X/Y drivers.  If set to non-zero, two samples
+	 * are taken with settle_delay us apart, and the second one is used.
+	 * ~150 uSec with 0.01uF caps.
+	 */
+	u16	settle_delay_usecs;
+
 	u16	x_plate_ohms;
 	u16	y_plate_ohms;
 
-- 
cgit v1.3-8-gc7d7


From 1d25891f3241103d14ea78236504474a138b8ada Mon Sep 17 00:00:00 2001
From: Semih Hazar <semih.hazar@indefia.com>
Date: Wed, 18 Jul 2007 00:36:04 -0400
Subject: Input: ads7846 - re-check pendown status before reporting events

Pendown status from the PENIRQ pin is currently read only at the beginning
of a sample set. If the pen is lifted just after sampling has began then
sampled values become wrong.

This patch adds an optional platform penirq_recheck_delay attribute.  If
non-zero, samples are only reported to the input subsystem if PENIRQ is
still active that long after the samples taken.

Signed-off-by: Semih Hazar <semih.hazar@indefia.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/ads7846.c | 15 +++++++++++++++
 include/linux/spi/ads7846.h         |  6 ++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index 103ee6ad299e..96581d08774f 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -107,6 +107,8 @@ struct ads7846 {
 	u16			debounce_tol;
 	u16			debounce_rep;
 
+	u16			penirq_recheck_delay_usecs;
+
 	spinlock_t		lock;
 	struct hrtimer		timer;
 	unsigned		pendown:1;	/* P: lock */
@@ -553,6 +555,15 @@ static void ads7846_rx(void *ads)
 		return;
 	}
 
+	/* Maybe check the pendown state before reporting. This discards
+	 * false readings when the pen is lifted.
+	 */
+	if (ts->penirq_recheck_delay_usecs) {
+		udelay(ts->penirq_recheck_delay_usecs);
+		if (!ts->get_pendown_state())
+			Rt = 0;
+	}
+
 	/* NOTE: We can't rely on the pressure to determine the pen down
 	 * state, even this controller has a pressure sensor.  The pressure
 	 * value can fluctuate for quite a while after lifting the pen and
@@ -896,6 +907,10 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 		ts->filter = ads7846_no_filter;
 	ts->get_pendown_state = pdata->get_pendown_state;
 
+	if (pdata->penirq_recheck_delay_usecs)
+		ts->penirq_recheck_delay_usecs =
+				pdata->penirq_recheck_delay_usecs;
+
 	snprintf(ts->phys, sizeof(ts->phys), "%s/input0", spi->dev.bus_id);
 
 	input_dev->name = "ADS784x Touchscreen";
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index a44fa7a02bd9..334d31411629 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -24,6 +24,12 @@ struct ads7846_platform_data {
 	 */
 	u16	settle_delay_usecs;
 
+	/* If set to non-zero, after samples are taken this delay is applied
+	 * and penirq is rechecked, to help avoid false events.  This value
+	 * is affected by the material used to build the touch layer.
+	 */
+	u16	penirq_recheck_delay_usecs;
+
 	u16	x_plate_ohms;
 	u16	y_plate_ohms;
 
-- 
cgit v1.3-8-gc7d7


From 85f202d5df877f8adcda342b74ab11fbdfea753d Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor@insightbb.com>
Date: Wed, 18 Jul 2007 00:37:01 -0400
Subject: Input: add driver for Fujitsu serial touchscreens

These serial touchscreens are found on some Fujitsu lifebook
P-series laptops, and the B6210. Using this requires a new
version of inputattach and doing:

 inputattach -fjt /dev/ttyS0

Big thanks to Stephen Hemminger for testing it and making it
work on his B6210 laptop.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/Kconfig      |  13 +++
 drivers/input/touchscreen/Makefile     |   1 +
 drivers/input/touchscreen/fujitsu_ts.c | 189 +++++++++++++++++++++++++++++++++
 include/linux/serio.h                  |   1 +
 4 files changed, 204 insertions(+)
 create mode 100644 drivers/input/touchscreen/fujitsu_ts.c

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index 69371779806a..f929fcdbae2e 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -54,6 +54,19 @@ config TOUCHSCREEN_CORGI
 	  To compile this driver as a module, choose M here: the
 	  module will be called corgi_ts.
 
+config TOUCHSCREEN_FUJITSU
+	tristate "Fujitsu serial touchscreen"
+	select SERIO
+	help
+	  Say Y here if you have the Fujitsu touchscreen (such as one
+	  installed in Lifebook P series laptop) connected to your
+	  system.
+
+	  If unsure, say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called fujitsu-ts.
+
 config TOUCHSCREEN_GUNZE
 	tristate "Gunze AHL-51S touchscreen"
 	select SERIO
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index 2f86d6ad06d3..5de8933c4993 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_TOUCHSCREEN_BITSY)		+= h3600_ts_input.o
 obj-$(CONFIG_TOUCHSCREEN_CORGI)		+= corgi_ts.o
 obj-$(CONFIG_TOUCHSCREEN_GUNZE)		+= gunze.o
 obj-$(CONFIG_TOUCHSCREEN_ELO)		+= elo.o
+obj-$(CONFIG_TOUCHSCREEN_FUJITSU)	+= fujitsu_ts.o
 obj-$(CONFIG_TOUCHSCREEN_MTOUCH)	+= mtouch.o
 obj-$(CONFIG_TOUCHSCREEN_MK712)		+= mk712.o
 obj-$(CONFIG_TOUCHSCREEN_HP600)		+= hp680_ts_input.o
diff --git a/drivers/input/touchscreen/fujitsu_ts.c b/drivers/input/touchscreen/fujitsu_ts.c
new file mode 100644
index 000000000000..daf7a4afc935
--- /dev/null
+++ b/drivers/input/touchscreen/fujitsu_ts.c
@@ -0,0 +1,189 @@
+/*
+ * Fujitsu serial touchscreen driver
+ *
+ * Copyright (c) Dmitry Torokhov <dtor@mail.ru>
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/input.h>
+#include <linux/serio.h>
+#include <linux/init.h>
+
+#define DRIVER_DESC	"Fujitsu serial touchscreen driver"
+
+MODULE_AUTHOR("Dmitry Torokhov <dtor@mail.ru>");
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
+
+#define FUJITSU_LENGTH 5
+
+/*
+ * Per-touchscreen data.
+ */
+struct fujitsu {
+	struct input_dev *dev;
+	struct serio *serio;
+	int idx;
+	unsigned char data[FUJITSU_LENGTH];
+	char phys[32];
+};
+
+/*
+ * Decode serial data (5 bytes per packet)
+ * First byte
+ * 1 C 0 0 R S S S
+ * Where C is 1 while in calibration mode (which we don't use)
+ * R is 1 when no coordinate corection was done.
+ * S are button state
+ */
+static irqreturn_t fujitsu_interrupt(struct serio *serio,
+				     unsigned char data, unsigned int flags)
+{
+	struct fujitsu *fujitsu = serio_get_drvdata(serio);
+	struct input_dev *dev = fujitsu->dev;
+
+	if (fujitsu->idx == 0) {
+		/* resync skip until start of frame */
+		if ((data & 0xf0) != 0x80)
+			return IRQ_HANDLED;
+	} else {
+		/* resync skip garbage */
+		if (data & 0x80) {
+			fujitsu->idx = 0;
+			return IRQ_HANDLED;
+		}
+	}
+
+	fujitsu->data[fujitsu->idx++] = data;
+	if (fujitsu->idx == FUJITSU_LENGTH) {
+		input_report_abs(dev, ABS_X,
+				 (fujitsu->data[2] << 7) | fujitsu->data[1]);
+		input_report_abs(dev, ABS_Y,
+				 (fujitsu->data[4] << 7) | fujitsu->data[3]);
+		input_report_key(dev, BTN_TOUCH,
+				 (fujitsu->data[0] & 0x03) != 2);
+		input_sync(dev);
+		fujitsu->idx = 0;
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * fujitsu_disconnect() is the opposite of fujitsu_connect()
+ */
+static void fujitsu_disconnect(struct serio *serio)
+{
+	struct fujitsu *fujitsu = serio_get_drvdata(serio);
+
+	input_get_device(fujitsu->dev);
+	input_unregister_device(fujitsu->dev);
+	serio_close(serio);
+	serio_set_drvdata(serio, NULL);
+	input_put_device(fujitsu->dev);
+	kfree(fujitsu);
+}
+
+/*
+ * fujitsu_connect() is the routine that is called when someone adds a
+ * new serio device that supports the Fujitsu protocol and registers it
+ * as input device.
+ */
+static int fujitsu_connect(struct serio *serio, struct serio_driver *drv)
+{
+	struct fujitsu *fujitsu;
+	struct input_dev *input_dev;
+	int err;
+
+	fujitsu = kzalloc(sizeof(struct fujitsu), GFP_KERNEL);
+	input_dev = input_allocate_device();
+	if (!fujitsu || !input_dev) {
+		err = -ENOMEM;
+		goto fail1;
+	}
+
+	fujitsu->serio = serio;
+	fujitsu->dev = input_dev;
+	snprintf(fujitsu->phys, sizeof(fujitsu->phys),
+		 "%s/input0", serio->phys);
+
+	input_dev->name = "Fujitsu Serial Touchscreen";
+	input_dev->phys = fujitsu->phys;
+	input_dev->id.bustype = BUS_RS232;
+	input_dev->id.vendor = SERIO_FUJITSU;
+	input_dev->id.product = 0;
+	input_dev->id.version = 0x0100;
+	input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS);
+	input_dev->keybit[LONG(BTN_TOUCH)] = BIT(BTN_TOUCH);
+
+	input_set_abs_params(input_dev, ABS_X, 0, 4096, 0, 0);
+	input_set_abs_params(input_dev, ABS_Y, 0, 4096, 0, 0);
+	serio_set_drvdata(serio, fujitsu);
+
+	err = serio_open(serio, drv);
+	if (err)
+		goto fail2;
+
+	err = input_register_device(fujitsu->dev);
+	if (err)
+		goto fail3;
+
+	return 0;
+
+ fail3:
+	serio_close(serio);
+ fail2:
+	serio_set_drvdata(serio, NULL);
+ fail1:
+	input_free_device(input_dev);
+	kfree(fujitsu);
+	return err;
+}
+
+/*
+ * The serio driver structure.
+ */
+static struct serio_device_id fujitsu_serio_ids[] = {
+	{
+		.type	= SERIO_RS232,
+		.proto	= SERIO_FUJITSU,
+		.id	= SERIO_ANY,
+		.extra	= SERIO_ANY,
+	},
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(serio, fujitsu_serio_ids);
+
+static struct serio_driver fujitsu_drv = {
+	.driver		= {
+		.name	= "fujitsu_ts",
+	},
+	.description	= DRIVER_DESC,
+	.id_table	= fujitsu_serio_ids,
+	.interrupt	= fujitsu_interrupt,
+	.connect	= fujitsu_connect,
+	.disconnect	= fujitsu_disconnect,
+};
+
+static int __init fujitsu_init(void)
+{
+	return serio_register_driver(&fujitsu_drv);
+}
+
+static void __exit fujitsu_exit(void)
+{
+	serio_unregister_driver(&fujitsu_drv);
+}
+
+module_init(fujitsu_init);
+module_exit(fujitsu_exit);
diff --git a/include/linux/serio.h b/include/linux/serio.h
index d9377ce9ffd1..9f3825014674 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -210,5 +210,6 @@ static inline void serio_unpin_driver(struct serio *serio)
 #define SERIO_TOUCHRIGHT	0x32
 #define SERIO_TOUCHWIN	0x33
 #define SERIO_TAOSEVM	0x34
+#define SERIO_FUJITSU	0x35
 
 #endif
-- 
cgit v1.3-8-gc7d7


From 5517853712f1f6daac8a7b2590f9b821e767aa13 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor@insightbb.com>
Date: Wed, 18 Jul 2007 00:38:45 -0400
Subject: Input: document intended meaning of KEY_SWITCHVIDEOMODE

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/input.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index 18c98b543030..e02c6a66b2ba 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -344,7 +344,8 @@ struct input_absinfo {
 #define KEY_BRIGHTNESSUP	225
 #define KEY_MEDIA		226
 
-#define KEY_SWITCHVIDEOMODE	227
+#define KEY_SWITCHVIDEOMODE	227	/* Cycle between available video
+					   outputs (Monitor/LCD/TV-out/etc) */
 #define KEY_KBDILLUMTOGGLE	228
 #define KEY_KBDILLUMDOWN	229
 #define KEY_KBDILLUMUP		230
-- 
cgit v1.3-8-gc7d7


From 456ad75c89cdb72e11dcdb6b0794802a6f50c8a3 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Wed, 18 Jul 2007 02:10:54 -0700
Subject: [NET]: move dev_mc_discard from dev_mcast.c to dev.c

Because this function is only called by unregister_netdevice,
this moving could make this non-global function static,
and also remove its declaration in netdevice.h;

Any further, function __dev_addr_discard is also just called by
dev_mc_discard and dev_unicast_discard, keeping this two functions
both in one c file could make __dev_addr_discard also static
and remove its declaration in netdevice.h;

Futhermore, the sequential call to dev_unicast_discard and then
dev_mc_discard in unregister_netdevice have a similar mechanism that:
(netif_tx_lock_bh / __dev_addr_discard / netif_tx_unlock_bh),
they should merged into one to eliminate duplicates in acquiring and
releasing the dev->_xmit_lock, this would be done in my following patch.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 --
 net/core/dev.c            | 14 +++++++++++++-
 net/core/dev_mcast.c      | 12 ------------
 3 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index da7a13c97eb8..9820ca1e45e2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1098,10 +1098,8 @@ extern int 		dev_mc_delete(struct net_device *dev, void *addr, int alen, int all
 extern int		dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly);
 extern int		dev_mc_sync(struct net_device *to, struct net_device *from);
 extern void		dev_mc_unsync(struct net_device *to, struct net_device *from);
-extern void		dev_mc_discard(struct net_device *dev);
 extern int 		__dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int all);
 extern int		__dev_addr_add(struct dev_addr_list **list, int *count, void *addr, int alen, int newonly);
-extern void		__dev_addr_discard(struct dev_addr_list **list);
 extern void		dev_set_promiscuity(struct net_device *dev, int inc);
 extern void		dev_set_allmulti(struct net_device *dev, int inc);
 extern void		netdev_state_change(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 13a0d9f6da54..3ba63aaa3001 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2715,7 +2715,7 @@ int __dev_addr_add(struct dev_addr_list **list, int *count,
 	return 0;
 }
 
-void __dev_addr_discard(struct dev_addr_list **list)
+static void __dev_addr_discard(struct dev_addr_list **list)
 {
 	struct dev_addr_list *tmp;
 
@@ -2785,6 +2785,18 @@ static void dev_unicast_discard(struct net_device *dev)
 	netif_tx_unlock_bh(dev);
 }
 
+/*
+ *	Discard multicast list when a device is downed
+ */
+
+static void dev_mc_discard(struct net_device *dev)
+{
+	netif_tx_lock_bh(dev);
+	__dev_addr_discard(&dev->mc_list);
+	dev->mc_count = 0;
+	netif_tx_unlock_bh(dev);
+}
+
 unsigned dev_get_flags(const struct net_device *dev)
 {
 	unsigned flags;
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index 235a2a8a0d05..99aece1aeccf 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -177,18 +177,6 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
 }
 EXPORT_SYMBOL(dev_mc_unsync);
 
-/*
- *	Discard multicast list when a device is downed
- */
-
-void dev_mc_discard(struct net_device *dev)
-{
-	netif_tx_lock_bh(dev);
-	__dev_addr_discard(&dev->mc_list);
-	dev->mc_count = 0;
-	netif_tx_unlock_bh(dev);
-}
-
 #ifdef CONFIG_PROC_FS
 static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-- 
cgit v1.3-8-gc7d7


From ebd61cc042b16e6cf2486aafbfff9e4be8c213ee Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 18 Jul 2007 02:21:50 -0700
Subject: [NETFILTER]: ipt_iprange.h must #include <linux/types.h>

ipt_iprange.h must #include <linux/types.h> since it uses __be32.

This patch fixes kernel Bugzilla #7604.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ipt_iprange.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ipt_iprange.h b/include/linux/netfilter_ipv4/ipt_iprange.h
index 34ab0fb736e2..a92fefc3c7ec 100644
--- a/include/linux/netfilter_ipv4/ipt_iprange.h
+++ b/include/linux/netfilter_ipv4/ipt_iprange.h
@@ -1,6 +1,8 @@
 #ifndef _IPT_IPRANGE_H
 #define _IPT_IPRANGE_H
 
+#include <linux/types.h>
+
 #define IPRANGE_SRC		0x01	/* Match source IP address */
 #define IPRANGE_DST		0x02	/* Match destination IP address */
 #define IPRANGE_SRC_INV		0x10	/* Negate the condition */
-- 
cgit v1.3-8-gc7d7


From 749269facaf87f6e516c3af12763e03181b9c139 Mon Sep 17 00:00:00 2001
From: Amit Arora <aarora@in.ibm.com>
Date: Wed, 18 Jul 2007 09:02:56 -0400
Subject: Change on-disk format to support 2^15 uninitialized extents

This change was suggested by Andreas Dilger.
This patch changes the EXT_MAX_LEN value and extent code which marks/checks
uninitialized extents. With this change it will be possible to have
initialized extents with 2^15 blocks (earlier the max blocks we could have
was 2^15 - 1). This way we can have better extent-to-block alignment.
Now, maximum number of blocks we can have in an initialized extent is 2^15
and in an uninitialized extent is 2^15 - 1.

Signed-off-by: Amit Arora <aarora@in.ibm.com>
---
 fs/ext4/extents.c               | 28 +++++++++++++++++++++++++---
 include/linux/ext4_fs_extents.h | 31 +++++++++++++++++++++++++++----
 2 files changed, 52 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ded3d469f978..77146b826a13 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1107,7 +1107,7 @@ static int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 				struct ext4_extent *ex2)
 {
-	unsigned short ext1_ee_len, ext2_ee_len;
+	unsigned short ext1_ee_len, ext2_ee_len, max_len;
 
 	/*
 	 * Make sure that either both extents are uninitialized, or
@@ -1116,6 +1116,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
 		return 0;
 
+	if (ext4_ext_is_uninitialized(ex1))
+		max_len = EXT_UNINIT_MAX_LEN;
+	else
+		max_len = EXT_INIT_MAX_LEN;
+
 	ext1_ee_len = ext4_ext_get_actual_len(ex1);
 	ext2_ee_len = ext4_ext_get_actual_len(ex2);
 
@@ -1128,7 +1133,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	 * as an RO_COMPAT feature, refuse to merge to extents if
 	 * this can result in the top bit of ee_len being set.
 	 */
-	if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
+	if (ext1_ee_len + ext2_ee_len > max_len)
 		return 0;
 #ifdef AGGRESSIVE_TEST
 	if (le16_to_cpu(ex1->ee_len) >= 4)
@@ -1815,7 +1820,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		ex->ee_block = cpu_to_le32(block);
 		ex->ee_len = cpu_to_le16(num);
-		if (uninitialized)
+		/*
+		 * Do not mark uninitialized if all the blocks in the
+		 * extent have been removed.
+		 */
+		if (uninitialized && num)
 			ext4_ext_mark_uninitialized(ex);
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2308,6 +2317,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* allocate new block */
 	goal = ext4_ext_find_goal(inode, path, iblock);
 
+	/*
+	 * See if request is beyond maximum number of blocks we can have in
+	 * a single extent. For an initialized extent this limit is
+	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
+	 * EXT_UNINIT_MAX_LEN.
+	 */
+	if (max_blocks > EXT_INIT_MAX_LEN &&
+	    create != EXT4_CREATE_UNINITIALIZED_EXT)
+		max_blocks = EXT_INIT_MAX_LEN;
+	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
+		 create == EXT4_CREATE_UNINITIALIZED_EXT)
+		max_blocks = EXT_UNINIT_MAX_LEN;
+
 	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
 	newex.ee_block = cpu_to_le32(iblock);
 	newex.ee_len = cpu_to_le16(max_blocks);
diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h
index edf49ec89eac..81406f3655d4 100644
--- a/include/linux/ext4_fs_extents.h
+++ b/include/linux/ext4_fs_extents.h
@@ -141,7 +141,25 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
 
 #define EXT_MAX_BLOCK	0xffffffff
 
-#define EXT_MAX_LEN	((1UL << 15) - 1)
+/*
+ * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
+ * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
+ * MSB of ee_len field in the extent datastructure to signify if this
+ * particular extent is an initialized extent or an uninitialized (i.e.
+ * preallocated).
+ * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an
+ * uninitialized extent.
+ * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
+ * uninitialized one. In other words, if MSB of ee_len is set, it is an
+ * uninitialized extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an uninitialized extent of zero length and
+ * thus we make it as a special case of initialized extent with 0x8000 length.
+ * This way we get better extent-to-group alignment for initialized extents.
+ * Hence, the maximum number of blocks we can have in an *initialized*
+ * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767).
+ */
+#define EXT_INIT_MAX_LEN	(1UL << 15)
+#define EXT_UNINIT_MAX_LEN	(EXT_INIT_MAX_LEN - 1)
 
 
 #define EXT_FIRST_EXTENT(__hdr__) \
@@ -190,17 +208,22 @@ ext4_ext_invalidate_cache(struct inode *inode)
 
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
 {
-	ext->ee_len |= cpu_to_le16(0x8000);
+	/* We can not have an uninitialized extent of zero length! */
+	BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
+	ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
 }
 
 static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
 {
-	return (int)(le16_to_cpu((ext)->ee_len) & 0x8000);
+	/* Extent with ee_len of 0x8000 is treated as an initialized extent */
+	return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
 }
 
 static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 {
-	return (int)(le16_to_cpu((ext)->ee_len) & 0x7FFF);
+	return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ?
+		le16_to_cpu(ext->ee_len) :
+		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
 
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
-- 
cgit v1.3-8-gc7d7


From ff9ddf7e847c4dc533f119efb6c77a6e57ab6397 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 18 Jul 2007 09:24:20 -0400
Subject: ext4: copy i_flags to inode flags on write

Propagate flags such as S_APPEND, S_IMMUTABLE, etc. from i_flags into
ext4-specific i_flags.  Quota code changes these flags on quota files
(to make it harder for sysadmin to screw himself) and these changes were
not correctly propagated into the filesystem.

(This is a forward port patch from ext3)

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c         | 20 ++++++++++++++++++++
 fs/ext4/ioctl.c         |  1 +
 include/linux/ext4_fs.h |  1 +
 3 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8416fa28c422..49035c5a2c43 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2583,6 +2583,25 @@ void ext4_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
+/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
+void ext4_get_inode_flags(struct ext4_inode_info *ei)
+{
+	unsigned int flags = ei->vfs_inode.i_flags;
+
+	ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+			EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		ei->i_flags |= EXT4_SYNC_FL;
+	if (flags & S_APPEND)
+		ei->i_flags |= EXT4_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		ei->i_flags |= EXT4_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		ei->i_flags |= EXT4_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		ei->i_flags |= EXT4_DIRSYNC_FL;
+}
+
 void ext4_read_inode(struct inode * inode)
 {
 	struct ext4_iloc iloc;
@@ -2744,6 +2763,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	if (ei->i_state & EXT4_STATE_NEW)
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 
+	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	if(!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7b4aa4543c83..9737432f079d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -28,6 +28,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 
 	switch (cmd) {
 	case EXT4_IOC_GETFLAGS:
+		ext4_get_inode_flags(ei);
 		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 87c2d7a05b01..33b2b1a2d790 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -868,6 +868,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
-- 
cgit v1.3-8-gc7d7


From e23291b9120c11aafb2ee76fb71a062eb3c1056c Mon Sep 17 00:00:00 2001
From: "Jose R. Santos" <jrs@us.ibm.com>
Date: Wed, 18 Jul 2007 08:57:06 -0400
Subject: jbd2: Fix CONFIG_JBD_DEBUG ifdef to be CONFIG_JBD2_DEBUG

When the JBD code was forked to create the new JBD2 code base, the
references to CONFIG_JBD_DEBUG where never changed to
CONFIG_JBD2_DEBUG.  This patch fixes that.

Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c           |  4 ++--
 fs/ext4/ioctl.c            |  4 ++--
 fs/jbd2/journal.c          | 14 +++++++-------
 fs/jbd2/recovery.c         |  2 +-
 include/linux/ext4_fs.h    |  4 ++--
 include/linux/ext4_fs_sb.h |  2 +-
 include/linux/jbd2.h       |  4 ++--
 7 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9de54ae48dee..e53b4af52f11 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -517,7 +517,7 @@ do_more:
 		/*
 		 * An HJ special.  This is expensive...
 		 */
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 		jbd_unlock_bh_state(bitmap_bh);
 		{
 			struct buffer_head *debug_bh;
@@ -1597,7 +1597,7 @@ allocated:
 
 	performed_allocation = 1;
 
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	{
 		struct buffer_head *debug_bh;
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9737432f079d..5b00775d5096 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -141,7 +141,7 @@ flags_err:
 		ext4_journal_stop(handle);
 		return err;
 	}
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	case EXT4_IOC_WAIT_FOR_READONLY:
 		/*
 		 * This is racy - by the time we're woken up and running,
@@ -283,7 +283,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC32_SETVERSION_OLD:
 		cmd = EXT4_IOC_SETVERSION_OLD;
 		break;
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	case EXT4_IOC32_WAIT_FOR_READONLY:
 		cmd = EXT4_IOC_WAIT_FOR_READONLY;
 		break;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 78d63b818f0b..8f530cc66d34 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -528,7 +528,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 {
 	int err = 0;
 
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	spin_lock(&journal->j_state_lock);
 	if (!tid_geq(journal->j_commit_request, tid)) {
 		printk(KERN_EMERG
@@ -1709,7 +1709,7 @@ void jbd2_slab_free(void *ptr,  size_t size)
  * Journal_head storage management
  */
 static struct kmem_cache *jbd2_journal_head_cache;
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 static atomic_t nr_journal_heads = ATOMIC_INIT(0);
 #endif
 
@@ -1747,7 +1747,7 @@ static struct journal_head *journal_alloc_journal_head(void)
 	struct journal_head *ret;
 	static unsigned long last_warning;
 
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	atomic_inc(&nr_journal_heads);
 #endif
 	ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
@@ -1768,7 +1768,7 @@ static struct journal_head *journal_alloc_journal_head(void)
 
 static void journal_free_journal_head(struct journal_head *jh)
 {
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	atomic_dec(&nr_journal_heads);
 	memset(jh, JBD_POISON_FREE, sizeof(*jh));
 #endif
@@ -1953,12 +1953,12 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 /*
  * /proc tunables
  */
-#if defined(CONFIG_JBD_DEBUG)
+#if defined(CONFIG_JBD2_DEBUG)
 int jbd2_journal_enable_debug;
 EXPORT_SYMBOL(jbd2_journal_enable_debug);
 #endif
 
-#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
+#if defined(CONFIG_JBD2_DEBUG) && defined(CONFIG_PROC_FS)
 
 static struct proc_dir_entry *proc_jbd_debug;
 
@@ -2073,7 +2073,7 @@ static int __init journal_init(void)
 
 static void __exit journal_exit(void)
 {
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	int n = atomic_read(&nr_journal_heads);
 	if (n)
 		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 395c92a04ac9..e7730a045b93 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -295,7 +295,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
 		++journal->j_transaction_sequence;
 	} else {
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 		int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
 #endif
 		jbd_debug(0,
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 33b2b1a2d790..45ec7258b2b1 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -243,7 +243,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_ADD		_IOW('f', 8,struct ext4_new_group_input)
 #define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
 #define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC_WAIT_FOR_READONLY	_IOR('f', 99, long)
 #endif
 #define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
@@ -259,7 +259,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_GETRSVSZ		_IOR('f', 5, int)
 #define EXT4_IOC32_SETRSVSZ		_IOW('f', 6, int)
 #define EXT4_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
 #endif
 #define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 2347557a327a..0f7dc15924bf 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -73,7 +73,7 @@ struct ext4_sb_info {
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
 	struct block_device *journal_bdev;
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
 	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
 #endif
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 0e0fedd2039a..a37aca31de46 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -50,11 +50,11 @@
  */
 #define JBD_DEFAULT_MAX_COMMIT_AGE 5
 
-#ifdef CONFIG_JBD_DEBUG
+#ifdef CONFIG_JBD2_DEBUG
 /*
  * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
  * consistency checks.  By default we don't do this unless
- * CONFIG_JBD_DEBUG is on.
+ * CONFIG_JBD2_DEBUG is on.
  */
 #define JBD_EXPENSIVE_CHECKING
 extern int jbd2_journal_enable_debug;
-- 
cgit v1.3-8-gc7d7


From 0f49d5d019afa4e94253bfc92f0daca3badb990b Mon Sep 17 00:00:00 2001
From: "Jose R. Santos" <jrs@us.ibm.com>
Date: Wed, 18 Jul 2007 08:50:18 -0400
Subject: jbd2: Move jbd2-debug file to debugfs

The jbd2-debug file used to be located in /proc/sys/fs/jbd2-debug, but it
incorrectly used create_proc_entry() instead of the sysctl routines, and
no proc entry was ever created.

Instead of fixing this we might as well move the jbd2-debug file to
debugfs which would be the preferred location for this kind of tunable.
The new location is now /sys/kernel/debug/jbd2/jbd2-debug.

Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/Kconfig           | 10 ++++----
 fs/jbd2/journal.c    | 67 +++++++++++++++++++++-------------------------------
 include/linux/jbd2.h |  2 +-
 3 files changed, 33 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 613df554728d..6a649902c5ac 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -251,7 +251,7 @@ config JBD2
 
 config JBD2_DEBUG
 	bool "JBD2 (ext4dev/ext4) debugging support"
-	depends on JBD2
+	depends on JBD2 && DEBUG_FS
 	help
 	  If you are using the ext4dev/ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
@@ -260,10 +260,10 @@ config JBD2_DEBUG
 	  By default, the debugging output will be turned off.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between
-	  1 and 5. The higher the number, the more debugging output is
-	  generated.  To turn debugging off again, do
-	  "echo 0 > /proc/sys/fs/jbd2-debug".
+	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+	  number between 1 and 5. The higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
 
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8f530cc66d34..f290cb7cb834 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -35,6 +35,7 @@
 #include <linux/kthread.h>
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -1951,64 +1952,50 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
 }
 
 /*
- * /proc tunables
+ * debugfs tunables
  */
 #if defined(CONFIG_JBD2_DEBUG)
-int jbd2_journal_enable_debug;
+u8 jbd2_journal_enable_debug;
 EXPORT_SYMBOL(jbd2_journal_enable_debug);
 #endif
 
-#if defined(CONFIG_JBD2_DEBUG) && defined(CONFIG_PROC_FS)
+#if defined(CONFIG_JBD2_DEBUG) && defined(CONFIG_DEBUG_FS)
 
-static struct proc_dir_entry *proc_jbd_debug;
+#define JBD2_DEBUG_NAME "jbd2-debug"
 
-static int read_jbd_debug(char *page, char **start, off_t off,
-			  int count, int *eof, void *data)
-{
-	int ret;
+struct dentry *jbd2_debugfs_dir, *jbd2_debug;
 
-	ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug);
-	*eof = 1;
-	return ret;
+static void __init jbd2_create_debugfs_entry(void)
+{
+	jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
+	if (jbd2_debugfs_dir)
+		jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO,
+					       jbd2_debugfs_dir,
+					       &jbd2_journal_enable_debug);
 }
 
-static int write_jbd_debug(struct file *file, const char __user *buffer,
-			   unsigned long count, void *data)
+static void __exit jbd2_remove_debugfs_entry(void)
 {
-	char buf[32];
-
-	if (count > ARRAY_SIZE(buf) - 1)
-		count = ARRAY_SIZE(buf) - 1;
-	if (copy_from_user(buf, buffer, count))
-		return -EFAULT;
-	buf[ARRAY_SIZE(buf) - 1] = '\0';
-	jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
-	return count;
+	if (jbd2_debug)
+		debugfs_remove(jbd2_debug);
+	if (jbd2_debugfs_dir)
+		debugfs_remove(jbd2_debugfs_dir);
 }
 
-#define JBD_PROC_NAME "sys/fs/jbd2-debug"
+#else
 
-static void __init create_jbd_proc_entry(void)
+static void __init jbd2_create_debugfs_entry(void)
 {
-	proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
-	if (proc_jbd_debug) {
-		/* Why is this so hard? */
-		proc_jbd_debug->read_proc = read_jbd_debug;
-		proc_jbd_debug->write_proc = write_jbd_debug;
-	}
+	do {
+	} while (0);
 }
 
-static void __exit jbd2_remove_jbd_proc_entry(void)
+static void __exit jbd2_remove_debugfs_entry(void)
 {
-	if (proc_jbd_debug)
-		remove_proc_entry(JBD_PROC_NAME, NULL);
+	do {
+	} while (0);
 }
 
-#else
-
-#define create_jbd_proc_entry() do {} while (0)
-#define jbd2_remove_jbd_proc_entry() do {} while (0)
-
 #endif
 
 struct kmem_cache *jbd2_handle_cache;
@@ -2067,7 +2054,7 @@ static int __init journal_init(void)
 	ret = journal_init_caches();
 	if (ret != 0)
 		jbd2_journal_destroy_caches();
-	create_jbd_proc_entry();
+	jbd2_create_debugfs_entry();
 	return ret;
 }
 
@@ -2078,7 +2065,7 @@ static void __exit journal_exit(void)
 	if (n)
 		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
 #endif
-	jbd2_remove_jbd_proc_entry();
+	jbd2_remove_debugfs_entry();
 	jbd2_journal_destroy_caches();
 }
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a37aca31de46..260d6d76c5f3 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -57,7 +57,7 @@
  * CONFIG_JBD2_DEBUG is on.
  */
 #define JBD_EXPENSIVE_CHECKING
-extern int jbd2_journal_enable_debug;
+extern u8 jbd2_journal_enable_debug;
 
 #define jbd_debug(n, f, a...)						\
 	do {								\
-- 
cgit v1.3-8-gc7d7


From ef7f38359ea8b3e9c7f2cae9a4d4935f55ca9e80 Mon Sep 17 00:00:00 2001
From: Kalpak Shah <kalpak@clusterfs.com>
Date: Wed, 18 Jul 2007 09:15:20 -0400
Subject: ext4: Add nanosecond timestamps

This patch adds nanosecond timestamps for ext4. This involves adding
*time_extra fields to the ext4_inode to extend the timestamps to
64-bits.  Creation time is also added by this patch.

These extended fields will fit into an inode if the filesystem was
formatted with large inodes (-I 256 or larger) and there are currently
no EAs consuming all of the available space. For new inodes we always
reserve enough space for the kernel's known extended fields, but for
inodes created with an old kernel this might not have been the case. So
this patch also adds the EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature
flag(ro-compat so that older kernels can't create inodes with a smaller
extra_isize). which indicates if the fields fitting inside
s_min_extra_isize are available or not.  If the expansion of inodes if
unsuccessful then this feature will be disabled.  This feature is only
enabled if requested by the sysadmin.

None of the extended inode fields is critical for correct filesystem
operation.

Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Kalpak Shah <kalpak@clusterfs.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c           |  8 ++---
 fs/ext4/inode.c            | 22 +++++++-----
 fs/ext4/ioctl.c            |  4 +--
 fs/ext4/namei.c            | 16 ++++-----
 fs/ext4/super.c            | 28 +++++++++++++++
 fs/ext4/xattr.c            |  2 +-
 include/linux/ext4_fs.h    | 86 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/ext4_fs_i.h  |  5 +++
 include/linux/ext4_fs_sb.h |  1 +
 9 files changed, 147 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c88b439ba5cd..427f83066a0d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -563,7 +563,8 @@ got:
 	inode->i_ino = ino;
 	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
+						       ext4_current_time(inode);
 
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_dir_start_lookup = 0;
@@ -595,9 +596,8 @@ got:
 	spin_unlock(&sbi->s_next_gen_lock);
 
 	ei->i_state = EXT4_STATE_NEW;
-	ei->i_extra_isize =
-		(EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
-		sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
+
+	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
 	if(DQUOT_ALLOC_INODE(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 49035c5a2c43..b83f91edebd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -726,7 +726,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
 
-	inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
 	/* had we spliced it onto indirect block? */
@@ -2375,7 +2375,7 @@ do_indirects:
 	ext4_discard_reservation(inode);
 
 	mutex_unlock(&ei->truncate_mutex);
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
 	/*
@@ -2629,10 +2629,6 @@ void ext4_read_inode(struct inode * inode)
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 	inode->i_size = le32_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
-	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
-	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
-	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
 	ei->i_state = 0;
 	ei->i_dir_start_lookup = 0;
@@ -2710,6 +2706,11 @@ void ext4_read_inode(struct inode * inode)
 	} else
 		ei->i_extra_isize = 0;
 
+	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
@@ -2791,9 +2792,12 @@ static int ext4_do_update_inode(handle_t *handle,
 	}
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
-	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
-	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+
+	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
 	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5b00775d5096..c04c7ccba9e3 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -97,7 +97,7 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		ei->i_flags = flags;
 
 		ext4_set_inode_flags(inode);
-		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_ctime = ext4_current_time(inode);
 
 		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
@@ -134,7 +134,7 @@ flags_err:
 			return PTR_ERR(handle);
 		err = ext4_reserve_inode_write(handle, inode, &iloc);
 		if (err == 0) {
-			inode->i_ctime = CURRENT_TIME_SEC;
+			inode->i_ctime = ext4_current_time(inode);
 			inode->i_generation = generation;
 			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 		}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2de339dd7554..40106b7ea4b8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1295,7 +1295,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	 * happen is that the times are slightly out of date
 	 * and/or different from the directory change time.
 	 */
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
 	ext4_mark_inode_dirty(handle, dir);
@@ -2056,7 +2056,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 	 * recovery. */
 	inode->i_size = 0;
 	ext4_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	drop_nlink(dir);
 	ext4_update_dx_flag(dir);
@@ -2106,13 +2106,13 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_unlink;
-	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	drop_nlink(inode);
 	if (!inode->i_nlink)
 		ext4_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime;
+	inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	retval = 0;
 
@@ -2203,7 +2203,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_ctime = ext4_current_time(inode);
 	inc_nlink(inode);
 	atomic_inc(&inode->i_count);
 
@@ -2305,7 +2305,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
 	 */
-	old_inode->i_ctime = CURRENT_TIME_SEC;
+	old_inode->i_ctime = ext4_current_time(old_inode);
 	ext4_mark_inode_dirty(handle, old_inode);
 
 	/*
@@ -2338,9 +2338,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 
 	if (new_inode) {
 		drop_nlink(new_inode);
-		new_inode->i_ctime = CURRENT_TIME_SEC;
+		new_inode->i_ctime = ext4_current_time(new_inode);
 	}
-	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
 	ext4_update_dx_flag(old_dir);
 	if (dir_bh) {
 		BUFFER_TRACE(dir_bh, "get_write_access");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index af0835187e76..b47259f6f39c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1651,6 +1651,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 				sbi->s_inode_size);
 			goto failed_mount;
 		}
+		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
+			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
 	}
 	sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
 				   le32_to_cpu(es->s_log_frag_size);
@@ -1874,6 +1876,32 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	}
 
 	ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+
+	/* determine the minimum size of new large inodes, if present */
+	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						     EXT4_GOOD_OLD_INODE_SIZE;
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_want_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_want_extra_isize);
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_min_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_min_extra_isize);
+		}
+	}
+	/* Check if enough inode space is available */
+	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+							sbi->s_inode_size) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						       EXT4_GOOD_OLD_INODE_SIZE;
+		printk(KERN_INFO "EXT4-fs: required extra inode space not"
+			"available.\n");
+	}
+
 	/*
 	 * akpm: core read_super() calls in here with the superblock locked.
 	 * That deadlocks, because orphan cleanup needs to lock the superblock
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e832e96095b3..fe16a569d06b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1013,7 +1013,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	}
 	if (!error) {
 		ext4_xattr_update_super_block(handle, inode->i_sb);
-		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_ctime = ext4_current_time(inode);
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext4_mark_iloc_dirty, even with
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 45ec7258b2b1..df5e38faa15f 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -288,7 +288,7 @@ struct ext4_inode {
 	__le16	i_uid;		/* Low 16 bits of Owner Uid */
 	__le32	i_size;		/* Size in bytes */
 	__le32	i_atime;	/* Access time */
-	__le32	i_ctime;	/* Creation time */
+	__le32	i_ctime;	/* Inode Change time */
 	__le32	i_mtime;	/* Modification time */
 	__le32	i_dtime;	/* Deletion Time */
 	__le16	i_gid;		/* Low 16 bits of Group Id */
@@ -337,10 +337,85 @@ struct ext4_inode {
 	} osd2;				/* OS dependent 2 */
 	__le16	i_extra_isize;
 	__le16	i_pad1;
+	__le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
+	__le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
+	__le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
+	__le32  i_crtime;       /* File Creation time */
+	__le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
 };
 
 #define i_size_high	i_dir_acl
 
+#define EXT4_EPOCH_BITS 2
+#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
+#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
+
+/*
+ * Extended fields will fit into an inode if the filesystem was formatted
+ * with large inodes (-I 256 or larger) and there are not currently any EAs
+ * consuming all of the available space. For new inodes we always reserve
+ * enough space for the kernel's known extended fields, but for inodes
+ * created with an old kernel this might not have been the case. None of
+ * the extended inode fields is critical for correct filesystem operation.
+ * This macro checks if a certain field fits in the inode. Note that
+ * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
+ */
+#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)	\
+	((offsetof(typeof(*ext4_inode), field) +	\
+	  sizeof((ext4_inode)->field))			\
+	<= (EXT4_GOOD_OLD_INODE_SIZE +			\
+	    (einode)->i_extra_isize))			\
+
+static inline __le32 ext4_encode_extra_time(struct timespec *time)
+{
+       return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
+			   time->tv_sec >> 32 : 0) |
+			   ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+}
+
+static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
+{
+       if (sizeof(time->tv_sec) > 4)
+	       time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
+			       << 32;
+       time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
+}
+
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)			       \
+do {									       \
+	(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec);	       \
+	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
+		(raw_inode)->xtime ## _extra =				       \
+				ext4_encode_extra_time(&(inode)->xtime);       \
+} while (0)
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)			       \
+do {									       \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \
+		(raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec);      \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
+		(raw_inode)->xtime ## _extra =				       \
+				ext4_encode_extra_time(&(einode)->xtime);      \
+} while (0)
+
+#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode)			       \
+do {									       \
+	(inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime);       \
+	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra))     \
+		ext4_decode_extra_time(&(inode)->xtime,			       \
+				       raw_inode->xtime ## _extra);	       \
+} while (0)
+
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)			       \
+do {									       \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \
+		(einode)->xtime.tv_sec = 				       \
+			(signed)le32_to_cpu((raw_inode)->xtime);	       \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
+		ext4_decode_extra_time(&(einode)->xtime,		       \
+				       raw_inode->xtime ## _extra);	       \
+} while (0)
+
 #if defined(__KERNEL__) || defined(__linux__)
 #define i_reserved1	osd1.linux1.l_i_reserved1
 #define i_frag		osd2.linux2.l_i_frag
@@ -539,6 +614,13 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
 	return container_of(inode, struct ext4_inode_info, vfs_inode);
 }
 
+static inline struct timespec ext4_current_time(struct inode *inode)
+{
+	return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
+
+
 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 {
 	return ino == EXT4_ROOT_INO ||
@@ -609,6 +691,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -626,6 +709,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 					 EXT4_FEATURE_INCOMPAT_64BIT)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
 					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
 
 /*
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h
index 9de494406995..1a511e9905aa 100644
--- a/include/linux/ext4_fs_i.h
+++ b/include/linux/ext4_fs_i.h
@@ -153,6 +153,11 @@ struct ext4_inode_info {
 
 	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
+	/*
+	 * File creation time. Its function is same as that of
+	 * struct timespec i_{a,c,m}time in the generic inode.
+	 */
+	struct timespec i_crtime;
 };
 
 #endif	/* _LINUX_EXT4_FS_I */
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 0f7dc15924bf..1b2ffee12be9 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -81,6 +81,7 @@ struct ext4_sb_info {
 	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
+	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
 
 #ifdef EXTENTS_STATS
 	/* ext4 extents stats */
-- 
cgit v1.3-8-gc7d7


From 6dd4ee7cab7e3a17c571aebd444f4344c8c4946e Mon Sep 17 00:00:00 2001
From: Kalpak Shah <kalpak@clusterfs.com>
Date: Wed, 18 Jul 2007 09:19:57 -0400
Subject: ext4: Expand extra_inodes space per the s_{want,min}_extra_isize
 fields

We need to make sure that existing ext3 filesystems can also avail the
new fields that have been added to the ext4 inode. We use
s_want_extra_isize and s_min_extra_isize to decide by how much we should
expand the inode. If EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set
then we expand the inode by max(s_want_extra_isize, s_min_extra_isize ,
sizeof(ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is
still an open question about whether users should be able to set
s_*_extra_isize smaller than the known fields or not.

This patch also adds the functionality to expand inodes to include the
newly added fields. We start by trying to expand by s_want_extra_isize
bytes and if its fails we try to expand by s_min_extra_isize bytes. This
is done by changing the i_extra_isize if enough space is available in
the inode and no EAs are present. If EAs are present and there is enough
space in the inode then the EAs in the inode are shifted to make space.
If enough space is not available in the inode due to the EAs then 1 or
more EAs are shifted to the external EA block. In the worst case when
even the external EA block does not have enough space we inform the user
that some EA would need to be deleted or s_min_extra_isize would have to
be reduced.

Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Kalpak Shah <kalpak@clusterfs.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c         |  63 ++++++++++-
 fs/ext4/xattr.c         | 274 ++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ext4/xattr.h         |  17 +++
 include/linux/ext4_fs.h |   1 +
 4 files changed, 347 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b83f91edebd1..f6d8528c4f55 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3105,6 +3105,39 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 	return err;
 }
 
+/*
+ * Expand an inode by new_extra_isize bytes.
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
+			struct ext4_iloc iloc, handle_t *handle)
+{
+	struct ext4_inode *raw_inode;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+
+	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
+		return 0;
+
+	raw_inode = ext4_raw_inode(&iloc);
+
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+
+	/* No extended attributes present */
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
+		header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+			new_extra_isize);
+		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+		return 0;
+	}
+
+	/* try to expand with EAs present */
+	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
+					  raw_inode, handle);
+}
+
 /*
  * What we do here is to mark the in-core inode as clean with respect to inode
  * dirtiness (it may still be data-dirty).
@@ -3129,10 +3162,38 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
 	struct ext4_iloc iloc;
-	int err;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	static unsigned int mnt_count;
+	int err, ret;
 
 	might_sleep();
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+		/*
+		 * We need extra buffer credits since we may write into EA block
+		 * with this same handle. If journal_extend fails, then it will
+		 * only result in a minor loss of functionality for that inode.
+		 * If this is felt to be critical, then e2fsck should be run to
+		 * force a large enough s_min_extra_isize.
+		 */
+		if ((jbd2_journal_extend(handle,
+			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+			ret = ext4_expand_extra_isize(inode,
+						      sbi->s_want_extra_isize,
+						      iloc, handle);
+			if (ret) {
+				EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+				if (mnt_count != sbi->s_es->s_mnt_count) {
+					ext4_warning(inode->i_sb, __FUNCTION__,
+					"Unable to expand inode %lu. Delete"
+					" some EAs or run e2fsck.",
+					inode->i_ino);
+					mnt_count = sbi->s_es->s_mnt_count;
+				}
+			}
+		}
+	}
 	if (!err)
 		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	return err;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fe16a569d06b..b10d68fffb55 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -66,13 +66,6 @@
 #define BFIRST(bh) ENTRY(BHDR(bh)+1)
 #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
 
-#define IHDR(inode, raw_inode) \
-	((struct ext4_xattr_ibody_header *) \
-		((void *)raw_inode + \
-		 EXT4_GOOD_OLD_INODE_SIZE + \
-		 EXT4_I(inode)->i_extra_isize))
-#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-
 #ifdef EXT4_XATTR_DEBUG
 # define ea_idebug(inode, f...) do { \
 		printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -508,6 +501,24 @@ out:
 	return;
 }
 
+/*
+ * Find the available free space for EAs. This also returns the total number of
+ * bytes used by EA entries.
+ */
+static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
+				    size_t *min_offs, void *base, int *total)
+{
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		*total += EXT4_XATTR_LEN(last->e_name_len);
+		if (!last->e_value_block && last->e_value_size) {
+			size_t offs = le16_to_cpu(last->e_value_offs);
+			if (offs < *min_offs)
+				*min_offs = offs;
+		}
+	}
+	return (*min_offs - ((void *)last - base) - sizeof(__u32));
+}
+
 struct ext4_xattr_info {
 	int name_index;
 	const char *name;
@@ -1014,6 +1025,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (!error) {
 		ext4_xattr_update_super_block(handle, inode->i_sb);
 		inode->i_ctime = ext4_current_time(inode);
+		if (!value)
+			EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1066,6 +1079,253 @@ retry:
 	return error;
 }
 
+/*
+ * Shift the EA entries in the inode to create space for the increased
+ * i_extra_isize.
+ */
+static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+				     int value_offs_shift, void *to,
+				     void *from, size_t n, int blocksize)
+{
+	struct ext4_xattr_entry *last = entry;
+	int new_offs;
+
+	/* Adjust the value offsets of the entries */
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		if (!last->e_value_block && last->e_value_size) {
+			new_offs = le16_to_cpu(last->e_value_offs) +
+							value_offs_shift;
+			BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
+				 > blocksize);
+			last->e_value_offs = cpu_to_le16(new_offs);
+		}
+	}
+	/* Shift the entries by n bytes */
+	memmove(to, from, n);
+}
+
+/*
+ * Expand an inode by new_extra_isize bytes when EAs are present.
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			       struct ext4_inode *raw_inode, handle_t *handle)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry, *last, *first;
+	struct buffer_head *bh = NULL;
+	struct ext4_xattr_ibody_find *is = NULL;
+	struct ext4_xattr_block_find *bs = NULL;
+	char *buffer = NULL, *b_entry_name = NULL;
+	size_t min_offs, free;
+	int total_ino, total_blk;
+	void *base, *start, *end;
+	int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
+	int s_min_extra_isize = EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize;
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+retry:
+	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+		up_write(&EXT4_I(inode)->xattr_sem);
+		return 0;
+	}
+
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+
+	/*
+	 * Check if enough free space is available in the inode to shift the
+	 * entries ahead by new_extra_isize.
+	 */
+
+	base = start = entry;
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	min_offs = end - base;
+	last = entry;
+	total_ino = sizeof(struct ext4_xattr_ibody_header);
+
+	free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
+	if (free >= new_extra_isize) {
+		entry = IFIRST(header);
+		ext4_xattr_shift_entries(entry,	EXT4_I(inode)->i_extra_isize
+				- new_extra_isize, (void *)raw_inode +
+				EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+				(void *)header, total_ino,
+				inode->i_sb->s_blocksize);
+		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+		error = 0;
+		goto cleanup;
+	}
+
+	/*
+	 * Enough free space isn't available in the inode, check if
+	 * EA block can hold new_extra_isize bytes.
+	 */
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bh)
+			goto cleanup;
+		if (ext4_xattr_check_block(bh)) {
+			ext4_error(inode->i_sb, __FUNCTION__,
+				"inode %lu: bad block %llu", inode->i_ino,
+				EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		base = BHDR(bh);
+		first = BFIRST(bh);
+		end = bh->b_data + bh->b_size;
+		min_offs = end - base;
+		free = ext4_xattr_free_space(first, &min_offs, base,
+					     &total_blk);
+		if (free < new_extra_isize) {
+			if (!tried_min_extra_isize && s_min_extra_isize) {
+				tried_min_extra_isize++;
+				new_extra_isize = s_min_extra_isize;
+				brelse(bh);
+				goto retry;
+			}
+			error = -1;
+			goto cleanup;
+		}
+	} else {
+		free = inode->i_sb->s_blocksize;
+	}
+
+	while (new_extra_isize > 0) {
+		size_t offs, size, entry_size;
+		struct ext4_xattr_entry *small_entry = NULL;
+		struct ext4_xattr_info i = {
+			.value = NULL,
+			.value_len = 0,
+		};
+		unsigned int total_size;  /* EA entry size + value size */
+		unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
+		unsigned int min_total_size = ~0U;
+
+		is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+		bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
+		if (!is || !bs) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+
+		is->s.not_found = -ENODATA;
+		bs->s.not_found = -ENODATA;
+		is->iloc.bh = NULL;
+		bs->bh = NULL;
+
+		last = IFIRST(header);
+		/* Find the entry best suited to be pushed into EA block */
+		entry = NULL;
+		for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+			total_size =
+			EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
+					EXT4_XATTR_LEN(last->e_name_len);
+			if (total_size <= free && total_size < min_total_size) {
+				if (total_size < new_extra_isize) {
+					small_entry = last;
+				} else {
+					entry = last;
+					min_total_size = total_size;
+				}
+			}
+		}
+
+		if (entry == NULL) {
+			if (small_entry) {
+				entry = small_entry;
+			} else {
+				if (!tried_min_extra_isize &&
+				    s_min_extra_isize) {
+					tried_min_extra_isize++;
+					new_extra_isize = s_min_extra_isize;
+					goto retry;
+				}
+				error = -1;
+				goto cleanup;
+			}
+		}
+		offs = le16_to_cpu(entry->e_value_offs);
+		size = le32_to_cpu(entry->e_value_size);
+		entry_size = EXT4_XATTR_LEN(entry->e_name_len);
+		i.name_index = entry->e_name_index,
+		buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+		b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
+		if (!buffer || !b_entry_name) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+		/* Save the entry name and the entry value */
+		memcpy(buffer, (void *)IFIRST(header) + offs,
+		       EXT4_XATTR_SIZE(size));
+		memcpy(b_entry_name, entry->e_name, entry->e_name_len);
+		b_entry_name[entry->e_name_len] = '\0';
+		i.name = b_entry_name;
+
+		error = ext4_get_inode_loc(inode, &is->iloc);
+		if (error)
+			goto cleanup;
+
+		error = ext4_xattr_ibody_find(inode, &i, is);
+		if (error)
+			goto cleanup;
+
+		/* Remove the chosen entry from the inode */
+		error = ext4_xattr_ibody_set(handle, inode, &i, is);
+
+		entry = IFIRST(header);
+		if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
+			shift_bytes = new_extra_isize;
+		else
+			shift_bytes = entry_size + size;
+		/* Adjust the offsets and shift the remaining entries ahead */
+		ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
+			shift_bytes, (void *)raw_inode +
+			EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
+			(void *)header, total_ino - entry_size,
+			inode->i_sb->s_blocksize);
+
+		extra_isize += shift_bytes;
+		new_extra_isize -= shift_bytes;
+		EXT4_I(inode)->i_extra_isize = extra_isize;
+
+		i.name = b_entry_name;
+		i.value = buffer;
+		i.value_len = cpu_to_le32(size);
+		error = ext4_xattr_block_find(inode, &i, bs);
+		if (error)
+			goto cleanup;
+
+		/* Add entry which was removed from the inode into the block */
+		error = ext4_xattr_block_set(handle, inode, &i, bs);
+		if (error)
+			goto cleanup;
+		kfree(b_entry_name);
+		kfree(buffer);
+		brelse(is->iloc.bh);
+		kfree(is);
+		kfree(bs);
+	}
+	brelse(bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return 0;
+
+cleanup:
+	kfree(b_entry_name);
+	kfree(buffer);
+	if (is)
+		brelse(is->iloc.bh);
+	kfree(is);
+	kfree(bs);
+	brelse(bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
+
+
 /*
  * ext4_xattr_delete_inode()
  *
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 79432b35398f..d7f5d6a12651 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -56,6 +56,13 @@ struct ext4_xattr_entry {
 #define EXT4_XATTR_SIZE(size) \
 	(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
 
+#define IHDR(inode, raw_inode) \
+	((struct ext4_xattr_ibody_header *) \
+		((void *)raw_inode + \
+		EXT4_GOOD_OLD_INODE_SIZE + \
+		EXT4_I(inode)->i_extra_isize))
+#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+
 # ifdef CONFIG_EXT4DEV_FS_XATTR
 
 extern struct xattr_handler ext4_xattr_user_handler;
@@ -74,6 +81,9 @@ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *,
 extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
 extern void ext4_xattr_put_super(struct super_block *);
 
+extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			    struct ext4_inode *raw_inode, handle_t *handle);
+
 extern int init_ext4_xattr(void);
 extern void exit_ext4_xattr(void);
 
@@ -129,6 +139,13 @@ exit_ext4_xattr(void)
 {
 }
 
+static inline int
+ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			    struct ext4_inode *raw_inode, handle_t *handle)
+{
+	return -EOPNOTSUPP;
+}
+
 #define ext4_xattr_handlers	NULL
 
 # endif  /* CONFIG_EXT4DEV_FS_XATTR */
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index df5e38faa15f..52dcc24dd986 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -202,6 +202,7 @@ struct ext4_group_desc
 #define EXT4_STATE_JDATA		0x00000001 /* journaled data exists */
 #define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
+#define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
-- 
cgit v1.3-8-gc7d7


From f8628a14a27eb4512a1ede43de1d9db4d9f92bc3 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@clusterfs.com>
Date: Wed, 18 Jul 2007 08:38:01 -0400
Subject: ext4: Remove 65000 subdirectory limit

This patch adds support to ext4 for allowing more than 65000
subdirectories. Currently the maximum number of subdirectories is capped
at 32000.

If we exceed 65000 subdirectories in an htree directory it sets the
inode link count to 1 and no longer counts subdirectories.  The
directory link count is not actually used when determining if a
directory is empty, as that only counts subdirectories and not regular
files that might be in there.

A EXT4_FEATURE_RO_COMPAT_DIR_NLINK flag has been added and it is set if
the subdir count for any directory crosses 65000. A later fsck will clear
EXT4_FEATURE_RO_COMPAT_DIR_NLINK if there are no longer any directory
with >65000 subdirs.

Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Kalpak Shah <kalpak@clusterfs.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c         | 60 +++++++++++++++++++++++++++++++++++++------------
 include/linux/ext4_fs.h |  4 +++-
 2 files changed, 49 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 40106b7ea4b8..da224974af78 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1629,6 +1629,35 @@ static int ext4_delete_entry (handle_t *handle,
 	return -ENOENT;
 }
 
+/*
+ * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
+ * since this indicates that nlinks count was previously 1.
+ */
+static void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+	if (is_dx(inode) && inode->i_nlink > 1) {
+		/* limit is 16-bit i_links_count */
+		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
+			inode->i_nlink = 1;
+			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
+					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+		}
+	}
+}
+
+/*
+ * If a directory had nlink == 1, then we should let it be 1. This indicates
+ * directory has >EXT4_LINK_MAX subdirs.
+ */
+static void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	drop_nlink(inode);
+	if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0)
+		inc_nlink(inode);
+}
+
+
 static int ext4_add_nondir(handle_t *handle,
 		struct dentry *dentry, struct inode *inode)
 {
@@ -1725,7 +1754,7 @@ static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	struct ext4_dir_entry_2 * de;
 	int err, retries = 0;
 
-	if (dir->i_nlink >= EXT4_LINK_MAX)
+	if (EXT4_DIR_LINK_MAX(dir))
 		return -EMLINK;
 
 retry:
@@ -1748,7 +1777,7 @@ retry:
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
 	dir_block = ext4_bread (handle, inode, 0, 1, &err);
 	if (!dir_block) {
-		drop_nlink(inode); /* is this nlink == 0? */
+		ext4_dec_count(handle, inode); /* is this nlink == 0? */
 		ext4_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -1780,7 +1809,7 @@ retry:
 		iput (inode);
 		goto out_stop;
 	}
-	inc_nlink(dir);
+	ext4_inc_count(handle, dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
@@ -2045,9 +2074,9 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
-	if (inode->i_nlink != 2)
+	if (!EXT4_DIR_LINK_EMPTY(inode))
 		ext4_warning (inode->i_sb, "ext4_rmdir",
-			      "empty directory has nlink!=2 (%d)",
+			      "empty directory has too many links (%d)",
 			      inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
@@ -2058,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 	ext4_orphan_add(handle, inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
-	drop_nlink(dir);
+	ext4_dec_count(handle, dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 
@@ -2109,7 +2138,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
-	drop_nlink(inode);
+	ext4_dec_count(handle, inode);
 	if (!inode->i_nlink)
 		ext4_orphan_add(handle, inode);
 	inode->i_ctime = ext4_current_time(inode);
@@ -2159,7 +2188,7 @@ retry:
 		err = __page_symlink(inode, symname, l,
 				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
-			drop_nlink(inode);
+			ext4_dec_count(handle, inode);
 			ext4_mark_inode_dirty(handle, inode);
 			iput (inode);
 			goto out_stop;
@@ -2185,8 +2214,9 @@ static int ext4_link (struct dentry * old_dentry,
 	struct inode *inode = old_dentry->d_inode;
 	int err, retries = 0;
 
-	if (inode->i_nlink >= EXT4_LINK_MAX)
+	if (EXT4_DIR_LINK_MAX(inode))
 		return -EMLINK;
+
 	/*
 	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
 	 * otherwise has the potential to corrupt the orphan inode list.
@@ -2204,7 +2234,7 @@ retry:
 		handle->h_sync = 1;
 
 	inode->i_ctime = ext4_current_time(inode);
-	inc_nlink(inode);
+	ext4_inc_count(handle, inode);
 	atomic_inc(&inode->i_count);
 
 	err = ext4_add_nondir(handle, dentry, inode);
@@ -2337,7 +2367,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	}
 
 	if (new_inode) {
-		drop_nlink(new_inode);
+		ext4_dec_count(handle, new_inode);
 		new_inode->i_ctime = ext4_current_time(new_inode);
 	}
 	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
@@ -2348,11 +2378,13 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
 		ext4_journal_dirty_metadata(handle, dir_bh);
-		drop_nlink(old_dir);
+		ext4_dec_count(handle, old_dir);
 		if (new_inode) {
-			drop_nlink(new_inode);
+			/* checked empty_dir above, can't have another parent,
+			 * ext3_dec_count() won't work for many-linked dirs */
+			new_inode->i_nlink = 0;
 		} else {
-			inc_nlink(new_dir);
+			ext4_inc_count(handle, new_dir);
 			ext4_update_dx_flag(new_dir);
 			ext4_mark_inode_dirty(handle, new_dir);
 		}
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 52dcc24dd986..cdee7aaa57aa 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -71,7 +71,7 @@
 /*
  * Maximal count of links to a file
  */
-#define EXT4_LINK_MAX		32000
+#define EXT4_LINK_MAX		65000
 
 /*
  * Macro-instructions used to manage several block sizes
@@ -692,6 +692,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
@@ -710,6 +711,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 					 EXT4_FEATURE_INCOMPAT_64BIT)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
 					 EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
 					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
 
-- 
cgit v1.3-8-gc7d7


From b187f180cc942e50007aa039f8e3a620ee5f3171 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <Yinghai.Lu@Sun.COM>
Date: Wed, 18 Jul 2007 00:49:10 -0700
Subject: serial: add early_serial_setup() back to header file

early_serial_setup was removed from serial.h, but forgot to put in
serial_8250.h

Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/kernel/setup.c                   | 1 +
 arch/mips/basler/excite/excite_setup.c    | 1 +
 arch/mips/gt64120/wrppmc/setup.c          | 1 +
 arch/mips/mips-boards/atlas/atlas_setup.c | 1 +
 arch/mips/mips-boards/sead/sead_setup.c   | 1 +
 arch/mips/mipssim/sim_setup.c             | 1 +
 arch/mips/pmc-sierra/msp71xx/msp_serial.c | 1 +
 arch/mips/pmc-sierra/yosemite/setup.c     | 1 +
 arch/ppc/platforms/4xx/bamboo.c           | 1 +
 arch/ppc/platforms/4xx/bubinga.c          | 1 +
 arch/ppc/platforms/4xx/cpci405.c          | 1 +
 arch/ppc/platforms/4xx/ebony.c            | 1 +
 arch/ppc/platforms/4xx/luan.c             | 1 +
 arch/ppc/platforms/4xx/ocotea.c           | 1 +
 arch/ppc/platforms/4xx/taishan.c          | 1 +
 arch/ppc/platforms/4xx/yucca.c            | 1 +
 arch/ppc/platforms/85xx/sbc8560.c         | 1 +
 arch/ppc/platforms/chestnut.c             | 1 +
 arch/ppc/platforms/ev64260.c              | 1 +
 arch/ppc/platforms/radstone_ppc7d.c       | 1 +
 arch/ppc/platforms/spruce.c               | 1 +
 drivers/parisc/superio.c                  | 1 +
 drivers/serial/8250_hp300.c               | 1 +
 include/linux/serial_8250.h               | 2 ++
 24 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c
index c1c32e4c863d..a74c08786b21 100644
--- a/arch/frv/kernel/setup.c
+++ b/arch/frv/kernel/setup.c
@@ -29,6 +29,7 @@
 #include <linux/serial.h>
 #include <linux/serial_core.h>
 #include <linux/serial_reg.h>
+#include <linux/serial_8250.h>
 
 #include <asm/setup.h>
 #include <asm/irq.h>
diff --git a/arch/mips/basler/excite/excite_setup.c b/arch/mips/basler/excite/excite_setup.c
index 2f0e4c08eb04..56003188f17c 100644
--- a/arch/mips/basler/excite/excite_setup.c
+++ b/arch/mips/basler/excite/excite_setup.c
@@ -26,6 +26,7 @@
 #include <linux/tty.h>
 #include <linux/serial_core.h>
 #include <linux/serial.h>
+#include <linux/serial_8250.h>
 #include <linux/ioport.h>
 #include <linux/spinlock.h>
 #include <asm/bootinfo.h>
diff --git a/arch/mips/gt64120/wrppmc/setup.c b/arch/mips/gt64120/wrppmc/setup.c
index ea965529e5e0..ed58c13b6032 100644
--- a/arch/mips/gt64120/wrppmc/setup.c
+++ b/arch/mips/gt64120/wrppmc/setup.c
@@ -14,6 +14,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/pm.h>
 
 #include <asm/io.h>
diff --git a/arch/mips/mips-boards/atlas/atlas_setup.c b/arch/mips/mips-boards/atlas/atlas_setup.c
index 1cc6ebbedfdd..c68358a476dd 100644
--- a/arch/mips/mips-boards/atlas/atlas_setup.c
+++ b/arch/mips/mips-boards/atlas/atlas_setup.c
@@ -22,6 +22,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/cpu.h>
 #include <asm/bootinfo.h>
diff --git a/arch/mips/mips-boards/sead/sead_setup.c b/arch/mips/mips-boards/sead/sead_setup.c
index bb801409d39b..5f70eaf01fab 100644
--- a/arch/mips/mips-boards/sead/sead_setup.c
+++ b/arch/mips/mips-boards/sead/sead_setup.c
@@ -23,6 +23,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/cpu.h>
 #include <asm/bootinfo.h>
diff --git a/arch/mips/mipssim/sim_setup.c b/arch/mips/mipssim/sim_setup.c
index 60e66906be65..17819b594105 100644
--- a/arch/mips/mipssim/sim_setup.c
+++ b/arch/mips/mipssim/sim_setup.c
@@ -26,6 +26,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/cpu.h>
 #include <asm/bootinfo.h>
diff --git a/arch/mips/pmc-sierra/msp71xx/msp_serial.c b/arch/mips/pmc-sierra/msp71xx/msp_serial.c
index c41b53faa8f6..e25bac537d77 100644
--- a/arch/mips/pmc-sierra/msp71xx/msp_serial.c
+++ b/arch/mips/pmc-sierra/msp71xx/msp_serial.c
@@ -32,6 +32,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/serial.h>
+#include <linux/serial_8250.h>
 
 #include <msp_prom.h>
 #include <msp_int.h>
diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c
index 6a6e15e40009..f7f93ae24c34 100644
--- a/arch/mips/pmc-sierra/yosemite/setup.c
+++ b/arch/mips/pmc-sierra/yosemite/setup.c
@@ -39,6 +39,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/time.h>
 #include <asm/bootinfo.h>
diff --git a/arch/ppc/platforms/4xx/bamboo.c b/arch/ppc/platforms/4xx/bamboo.c
index 349660b84a02..017623c9bc4b 100644
--- a/arch/ppc/platforms/4xx/bamboo.c
+++ b/arch/ppc/platforms/4xx/bamboo.c
@@ -29,6 +29,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/ethtool.h>
 
 #include <asm/system.h>
diff --git a/arch/ppc/platforms/4xx/bubinga.c b/arch/ppc/platforms/4xx/bubinga.c
index 1a7f075b754f..cd696be55aca 100644
--- a/arch/ppc/platforms/4xx/bubinga.c
+++ b/arch/ppc/platforms/4xx/bubinga.c
@@ -21,6 +21,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/system.h>
 #include <asm/pci-bridge.h>
diff --git a/arch/ppc/platforms/4xx/cpci405.c b/arch/ppc/platforms/4xx/cpci405.c
index 8474b05b795a..2e7e25dd84cb 100644
--- a/arch/ppc/platforms/4xx/cpci405.c
+++ b/arch/ppc/platforms/4xx/cpci405.c
@@ -23,6 +23,7 @@
 #include <asm/todc.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <asm/ocp.h>
 #include <asm/ibm_ocp_pci.h>
 #include <platforms/4xx/ibm405gp.h>
diff --git a/arch/ppc/platforms/4xx/ebony.c b/arch/ppc/platforms/4xx/ebony.c
index f0f9cc8480ca..05d7184d7e14 100644
--- a/arch/ppc/platforms/4xx/ebony.c
+++ b/arch/ppc/platforms/4xx/ebony.c
@@ -32,6 +32,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/4xx/luan.c b/arch/ppc/platforms/4xx/luan.c
index 61706ef37112..4b169610f154 100644
--- a/arch/ppc/platforms/4xx/luan.c
+++ b/arch/ppc/platforms/4xx/luan.c
@@ -30,6 +30,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/4xx/ocotea.c b/arch/ppc/platforms/4xx/ocotea.c
index 5e994e146ba8..fd0f971881d6 100644
--- a/arch/ppc/platforms/4xx/ocotea.c
+++ b/arch/ppc/platforms/4xx/ocotea.c
@@ -30,6 +30,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/4xx/taishan.c b/arch/ppc/platforms/4xx/taishan.c
index 5d9af8ddb155..888c492b4a45 100644
--- a/arch/ppc/platforms/4xx/taishan.c
+++ b/arch/ppc/platforms/4xx/taishan.c
@@ -30,6 +30,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/nand.h>
diff --git a/arch/ppc/platforms/4xx/yucca.c b/arch/ppc/platforms/4xx/yucca.c
index 346787df0ddb..a83b0baea011 100644
--- a/arch/ppc/platforms/4xx/yucca.c
+++ b/arch/ppc/platforms/4xx/yucca.c
@@ -31,6 +31,7 @@
 #include <linux/tty.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/85xx/sbc8560.c b/arch/ppc/platforms/85xx/sbc8560.c
index 1d10ab98f66d..3d7addbdecfd 100644
--- a/arch/ppc/platforms/85xx/sbc8560.c
+++ b/arch/ppc/platforms/85xx/sbc8560.c
@@ -26,6 +26,7 @@
 #include <linux/serial.h>
 #include <linux/tty.h>	/* for linux/serial_core.h */
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/initrd.h>
 #include <linux/module.h>
 #include <linux/fsl_devices.h>
diff --git a/arch/ppc/platforms/chestnut.c b/arch/ppc/platforms/chestnut.c
index a764ae71cbcb..248684f50dd9 100644
--- a/arch/ppc/platforms/chestnut.c
+++ b/arch/ppc/platforms/chestnut.c
@@ -25,6 +25,7 @@
 #include <linux/ide.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/mtd/physmap.h>
 #include <asm/system.h>
 #include <asm/pgtable.h>
diff --git a/arch/ppc/platforms/ev64260.c b/arch/ppc/platforms/ev64260.c
index 4957a7bcde22..976270d537c1 100644
--- a/arch/ppc/platforms/ev64260.c
+++ b/arch/ppc/platforms/ev64260.c
@@ -35,6 +35,7 @@
 #include <linux/serial.h>
 #include <linux/tty.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #else
 #include <linux/mv643xx.h>
 #endif
diff --git a/arch/ppc/platforms/radstone_ppc7d.c b/arch/ppc/platforms/radstone_ppc7d.c
index b55860734a72..44d4398a36ff 100644
--- a/arch/ppc/platforms/radstone_ppc7d.c
+++ b/arch/ppc/platforms/radstone_ppc7d.c
@@ -35,6 +35,7 @@
 #include <linux/serial.h>
 #include <linux/tty.h>		/* for linux/serial_core.h */
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/mv643xx.h>
 #include <linux/netdevice.h>
 #include <linux/platform_device.h>
diff --git a/arch/ppc/platforms/spruce.c b/arch/ppc/platforms/spruce.c
index 3c7842784876..f4de50ba292e 100644
--- a/arch/ppc/platforms/spruce.c
+++ b/arch/ppc/platforms/spruce.c
@@ -27,6 +27,7 @@
 #include <linux/serial.h>
 #include <linux/tty.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c
index a708c329675e..38cdf9fa36a7 100644
--- a/drivers/parisc/superio.c
+++ b/drivers/parisc/superio.c
@@ -73,6 +73,7 @@
 #include <linux/termios.h>
 #include <linux/tty.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/delay.h>
 
 #include <asm/io.h>
diff --git a/drivers/serial/8250_hp300.c b/drivers/serial/8250_hp300.c
index 53e81a44c1a3..2cf0953fe0ec 100644
--- a/drivers/serial/8250_hp300.c
+++ b/drivers/serial/8250_hp300.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/serial.h>
 #include <linux/serial_core.h>
+#include <linux/serial_8250.h>
 #include <linux/delay.h>
 #include <linux/dio.h>
 #include <linux/console.h>
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 706ee9a4c80c..8518fa2a6f89 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -60,6 +60,8 @@ void serial8250_unregister_port(int line);
 void serial8250_suspend_port(int line);
 void serial8250_resume_port(int line);
 
+extern int early_serial_setup(struct uart_port *port);
+
 extern int serial8250_find_port(struct uart_port *p);
 extern int serial8250_find_port_for_earlycon(void);
 extern int setup_early_serial8250_console(char *cmdline);
-- 
cgit v1.3-8-gc7d7


From 8b4a40809e5330c9da5d20107d693d92d73b31dc Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Wed, 18 Jul 2007 00:49:11 -0700
Subject: zs: move to the serial subsystem

This is a reimplementation of the zs driver for the serial subsystem.  Any
resemblance to the old driver is purely coincidential.  ;-) I do hope I got
the handling of modem lines right -- better do not tackle me about the
issue unless you feel too good...

Any users of the old driver: please note the numbers of the serial lines
have now been swapped, i.e.  ttyS0 <-> ttyS1 and ttyS2 <-> ttyS3.  It has
to do with the modem lines mentioned above; basically the port A in a given
chip has to be initialised before the port B if you want to use the latter
as the serial console (which is usually the case), as operations on modem
lines of the serial line associated with the port B access both ports (see
the comment at the top of the driver for the details of wiring used).
Please update your scripts.

This is also the reason each SCC now requests an IRQ once only (as seen in
"/proc/interrupts") -- the handler takes care of both ports at once as the
line associated with the port B has to take status update interrupts from
both ports (and yet the line of the port A takes its own for itself too).
The old driver never got it right...

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                   |    5 +
 drivers/char/Kconfig          |   33 -
 drivers/char/decserial.c      |   67 --
 drivers/serial/Kconfig        |   30 +
 drivers/serial/Makefile       |    1 +
 drivers/serial/zs.c           | 1287 ++++++++++++++++++++++++
 drivers/serial/zs.h           |  284 ++++++
 drivers/tc/Makefile           |    1 -
 drivers/tc/zs.c               | 2203 -----------------------------------------
 drivers/tc/zs.h               |  404 --------
 include/asm-mips/dec/serial.h |   36 -
 include/linux/serial_core.h   |    5 +-
 12 files changed, 1610 insertions(+), 2746 deletions(-)
 delete mode 100644 drivers/char/decserial.c
 create mode 100644 drivers/serial/zs.c
 create mode 100644 drivers/serial/zs.h
 delete mode 100644 drivers/tc/zs.c
 delete mode 100644 drivers/tc/zs.h
 delete mode 100644 include/asm-mips/dec/serial.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 368a7181fa14..a9615a567da6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4110,6 +4110,11 @@ W:	http://www.polyware.nl/~middelin/En/hobbies.html
 W:	http://www.polyware.nl/~middelin/hobbies.html
 S:	Maintained
 
+ZS DECSTATION Z85C30 SERIAL DRIVER
+P:	Maciej W. Rozycki
+M:	macro@linux-mips.org
+S:	Maintained
+
 THE REST
 P:	Linus Torvalds
 S:	Buried alive in reporters
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index d8d7125529c4..97bd71bc3aea 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -372,39 +372,6 @@ config ISTALLION
 	  To compile this driver as a module, choose M here: the
 	  module will be called istallion.
 
-config SERIAL_DEC
-	bool "DECstation serial support"
-	depends on MACH_DECSTATION
-	default y
-	help
-	  This selects whether you want to be asked about drivers for
-	  DECstation serial ports.
-
-	  Note that the answer to this question won't directly affect the
-	  kernel: saying N will just cause the configurator to skip all
-	  the questions about DECstation serial ports.
-
-config SERIAL_DEC_CONSOLE
-	bool "Support for console on a DECstation serial port"
-	depends on SERIAL_DEC
-	default y
-	help
-	  If you say Y here, it will be possible to use a serial port as the
-	  system console (the system console is the device which receives all
-	  kernel messages and warnings and which allows logins in single user
-	  mode).  Note that the firmware uses ttyS0 as the serial console on
-	  the Maxine and ttyS2 on the others.
-
-	  If unsure, say Y.
-
-config ZS
-	bool "Z85C30 Serial Support"
-	depends on SERIAL_DEC
-	default y
-	help
-	  Documentation on the Zilog 85C350 serial communications controller
-	  is downloadable at <http://www.zilog.com/pdfs/serial/z85c30.pdf>
-
 config A2232
 	tristate "Commodore A2232 serial support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && ZORRO && BROKEN_ON_SMP
diff --git a/drivers/char/decserial.c b/drivers/char/decserial.c
deleted file mode 100644
index 8ea2bea2b183..000000000000
--- a/drivers/char/decserial.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * sercons.c
- *      choose the right serial device at boot time
- *
- * triemer 6-SEP-1998
- *      sercons.c is designed to allow the three different kinds 
- *      of serial devices under the decstation world to co-exist
- *      in the same kernel.  The idea here is to abstract 
- *      the pieces of the drivers that are common to this file
- *      so that they do not clash at compile time and runtime.
- *
- * HK 16-SEP-1998 v0.002
- *      removed the PROM console as this is not a real serial
- *      device. Added support for PROM console in drivers/char/tty_io.c
- *      instead. Although it may work to enable more than one 
- *      console device I strongly recommend to use only one.
- */
-
-#include <linux/init.h>
-#include <asm/dec/machtype.h>
-
-#ifdef CONFIG_ZS
-extern int zs_init(void);
-#endif
-
-#ifdef CONFIG_SERIAL_CONSOLE
-
-#ifdef CONFIG_ZS
-extern void zs_serial_console_init(void);
-#endif
-
-#endif
-
-/* rs_init - starts up the serial interface -
-   handle normal case of starting up the serial interface */
-
-#ifdef CONFIG_SERIAL
-
-int __init rs_init(void)
-{
-#ifdef CONFIG_ZS
-    if (IOASIC)
-	return zs_init();
-#endif
-    return -ENXIO;
-}
-
-__initcall(rs_init);
-
-#endif
-
-#ifdef CONFIG_SERIAL_CONSOLE
-
-/* serial_console_init handles the special case of starting
- *   up the console on the serial port
- */
-static int __init decserial_console_init(void)
-{
-#ifdef CONFIG_ZS
-    if (IOASIC)
-	zs_serial_console_init();
-#endif
-    return 0;
-}
-console_initcall(decserial_console_init);
-
-#endif
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 7fa413ddccf5..18f629706448 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -486,6 +486,36 @@ config SERIAL_DZ_CONSOLE
 
 	  If unsure, say Y.
 
+config SERIAL_ZS
+	tristate "DECstation Z85C30 serial support"
+	depends on MACH_DECSTATION
+	select SERIAL_CORE
+	default y
+	---help---
+	  Support for the Zilog 85C350 serial communications controller used
+	  for serial ports in newer DECstation systems.  These include the
+	  DECsystem 5900 and all models of the DECstation and DECsystem 5000
+	  systems except from model 200.
+
+	  If unsure, say Y.  To compile this driver as a module, choose M here:
+	  the module will be called zs.
+
+config SERIAL_ZS_CONSOLE
+	bool "Support for console on a DECstation Z85C30 serial port"
+	depends on SERIAL_ZS=y
+	select SERIAL_CORE_CONSOLE
+	default y
+	---help---
+	  If you say Y here, it will be possible to use a serial port as the
+	  system console (the system console is the device which receives all
+	  kernel messages and warnings and which allows logins in single user
+	  mode).
+
+	  Note that the firmware uses ttyS1 as the serial console on the
+	  Maxine and ttyS3 on the others using this driver.
+
+	  If unsure, say Y.
+
 config SERIAL_21285
 	tristate "DC21285 serial port support"
 	depends on ARM && FOOTBRIDGE
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index c48cdd61b736..af6377d480d7 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_V850E_UART) += v850e_uart.o
 obj-$(CONFIG_SERIAL_PMACZILOG) += pmac_zilog.o
 obj-$(CONFIG_SERIAL_LH7A40X) += serial_lh7a40x.o
 obj-$(CONFIG_SERIAL_DZ) += dz.o
+obj-$(CONFIG_SERIAL_ZS) += zs.o
 obj-$(CONFIG_SERIAL_SH_SCI) += sh-sci.o
 obj-$(CONFIG_SERIAL_SGI_L1_CONSOLE) += sn_console.o
 obj-$(CONFIG_SERIAL_CPM) += cpm_uart/
diff --git a/drivers/serial/zs.c b/drivers/serial/zs.c
new file mode 100644
index 000000000000..65f1294fd27b
--- /dev/null
+++ b/drivers/serial/zs.c
@@ -0,0 +1,1287 @@
+/*
+ * zs.c: Serial port driver for IOASIC DECstations.
+ *
+ * Derived from drivers/sbus/char/sunserial.c by Paul Mackerras.
+ * Derived from drivers/macintosh/macserial.c by Harald Koerfgen.
+ *
+ * DECstation changes
+ * Copyright (C) 1998-2000 Harald Koerfgen
+ * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007  Maciej W. Rozycki
+ *
+ * For the rest of the code the original Copyright applies:
+ * Copyright (C) 1996 Paul Mackerras (Paul.Mackerras@cs.anu.edu.au)
+ * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
+ *
+ *
+ * Note: for IOASIC systems the wiring is as follows:
+ *
+ * mouse/keyboard:
+ * DIN-7 MJ-4  signal        SCC
+ * 2     1     TxD       <-  A.TxD
+ * 3     4     RxD       ->  A.RxD
+ *
+ * EIA-232/EIA-423:
+ * DB-25 MMJ-6 signal        SCC
+ * 2     2     TxD       <-  B.TxD
+ * 3     5     RxD       ->  B.RxD
+ * 4           RTS       <- ~A.RTS
+ * 5           CTS       -> ~B.CTS
+ * 6     6     DSR       -> ~A.SYNC
+ * 8           CD        -> ~B.DCD
+ * 12          DSRS(DCE) -> ~A.CTS  (*)
+ * 15          TxC       ->  B.TxC
+ * 17          RxC       ->  B.RxC
+ * 20    1     DTR       <- ~A.DTR
+ * 22          RI        -> ~A.DCD
+ * 23          DSRS(DTE) <- ~B.RTS
+ *
+ * (*) EIA-232 defines the signal at this pin to be SCD, while DSRS(DCE)
+ *     is shared with DSRS(DTE) at pin 23.
+ *
+ * As you can immediately notice the wiring of the RTS, DTR and DSR signals
+ * is a bit odd.  This makes the handling of port B unnecessarily
+ * complicated and prevents the use of some automatic modes of operation.
+ */
+
+#if defined(CONFIG_SERIAL_ZS_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/bug.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/irqflags.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/serial.h>
+#include <linux/serial_core.h>
+#include <linux/spinlock.h>
+#include <linux/sysrq.h>
+#include <linux/tty.h>
+#include <linux/types.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+#include <asm/dec/interrupts.h>
+#include <asm/dec/ioasic_addrs.h>
+#include <asm/dec/system.h>
+
+#include "zs.h"
+
+
+MODULE_AUTHOR("Maciej W. Rozycki <macro@linux-mips.org>");
+MODULE_DESCRIPTION("DECstation Z85C30 serial driver");
+MODULE_LICENSE("GPL");
+
+
+static char zs_name[] __initdata = "DECstation Z85C30 serial driver version ";
+static char zs_version[] __initdata = "0.10";
+
+/*
+ * It would be nice to dynamically allocate everything that
+ * depends on ZS_NUM_SCCS, so we could support any number of
+ * Z85C30s, but for now...
+ */
+#define ZS_NUM_SCCS	2		/* Max # of ZS chips supported.  */
+#define ZS_NUM_CHAN	2		/* 2 channels per chip.  */
+#define ZS_CHAN_A	0		/* Index of the channel A.  */
+#define ZS_CHAN_B	1		/* Index of the channel B.  */
+#define ZS_CHAN_IO_SIZE 8		/* IOMEM space size.  */
+#define ZS_CHAN_IO_STRIDE 4		/* Register alignment.  */
+#define ZS_CHAN_IO_OFFSET 1		/* The SCC resides on the high byte
+					   of the 16-bit IOBUS.  */
+#define ZS_CLOCK        7372800 	/* Z85C30 PCLK input clock rate.  */
+
+#define to_zport(uport) container_of(uport, struct zs_port, port)
+
+struct zs_parms {
+	resource_size_t scc[ZS_NUM_SCCS];
+	int irq[ZS_NUM_SCCS];
+};
+
+static struct zs_scc zs_sccs[ZS_NUM_SCCS];
+
+static u8 zs_init_regs[ZS_NUM_REGS] __initdata = {
+	0,				/* write 0 */
+	PAR_SPEC,			/* write 1 */
+	0,				/* write 2 */
+	0,				/* write 3 */
+	X16CLK | SB1,			/* write 4 */
+	0,				/* write 5 */
+	0, 0, 0,			/* write 6, 7, 8 */
+	MIE | DLC | NV,			/* write 9 */
+	NRZ,				/* write 10 */
+	TCBR | RCBR,			/* write 11 */
+	0, 0,				/* BRG time constant, write 12 + 13 */
+	BRSRC | BRENABL,		/* write 14 */
+	0,				/* write 15 */
+};
+
+/*
+ * Debugging.
+ */
+#undef ZS_DEBUG_REGS
+
+
+/*
+ * Reading and writing Z85C30 registers.
+ */
+static void recovery_delay(void)
+{
+	udelay(2);
+}
+
+static u8 read_zsreg(struct zs_port *zport, int reg)
+{
+	void __iomem *control = zport->port.membase + ZS_CHAN_IO_OFFSET;
+	u8 retval;
+
+	if (reg != 0) {
+		writeb(reg & 0xf, control);
+		fast_iob();
+		recovery_delay();
+	}
+	retval = readb(control);
+	recovery_delay();
+	return retval;
+}
+
+static void write_zsreg(struct zs_port *zport, int reg, u8 value)
+{
+	void __iomem *control = zport->port.membase + ZS_CHAN_IO_OFFSET;
+
+	if (reg != 0) {
+		writeb(reg & 0xf, control);
+		fast_iob(); recovery_delay();
+	}
+	writeb(value, control);
+	fast_iob();
+	recovery_delay();
+	return;
+}
+
+static u8 read_zsdata(struct zs_port *zport)
+{
+	void __iomem *data = zport->port.membase +
+			     ZS_CHAN_IO_STRIDE + ZS_CHAN_IO_OFFSET;
+	u8 retval;
+
+	retval = readb(data);
+	recovery_delay();
+	return retval;
+}
+
+static void write_zsdata(struct zs_port *zport, u8 value)
+{
+	void __iomem *data = zport->port.membase +
+			     ZS_CHAN_IO_STRIDE + ZS_CHAN_IO_OFFSET;
+
+	writeb(value, data);
+	fast_iob();
+	recovery_delay();
+	return;
+}
+
+#ifdef ZS_DEBUG_REGS
+void zs_dump(void)
+{
+	struct zs_port *zport;
+	int i, j;
+
+	for (i = 0; i < ZS_NUM_SCCS * ZS_NUM_CHAN; i++) {
+		zport = &zs_sccs[i / ZS_NUM_CHAN].zport[i % ZS_NUM_CHAN];
+
+		if (!zport->scc)
+			continue;
+
+		for (j = 0; j < 16; j++)
+			printk("W%-2d = 0x%02x\t", j, zport->regs[j]);
+		printk("\n");
+		for (j = 0; j < 16; j++)
+			printk("R%-2d = 0x%02x\t", j, read_zsreg(zport, j));
+		printk("\n\n");
+	}
+}
+#endif
+
+
+static void zs_spin_lock_cond_irq(spinlock_t *lock, int irq)
+{
+	if (irq)
+		spin_lock_irq(lock);
+	else
+		spin_lock(lock);
+}
+
+static void zs_spin_unlock_cond_irq(spinlock_t *lock, int irq)
+{
+	if (irq)
+		spin_unlock_irq(lock);
+	else
+		spin_unlock(lock);
+}
+
+static int zs_receive_drain(struct zs_port *zport)
+{
+	int loops = 10000;
+
+	while ((read_zsreg(zport, R0) & Rx_CH_AV) && loops--)
+		read_zsdata(zport);
+	return loops;
+}
+
+static int zs_transmit_drain(struct zs_port *zport, int irq)
+{
+	struct zs_scc *scc = zport->scc;
+	int loops = 10000;
+
+	while (!(read_zsreg(zport, R0) & Tx_BUF_EMP) && loops--) {
+		zs_spin_unlock_cond_irq(&scc->zlock, irq);
+		udelay(2);
+		zs_spin_lock_cond_irq(&scc->zlock, irq);
+	}
+	return loops;
+}
+
+static int zs_line_drain(struct zs_port *zport, int irq)
+{
+	struct zs_scc *scc = zport->scc;
+	int loops = 10000;
+
+	while (!(read_zsreg(zport, R1) & ALL_SNT) && loops--) {
+		zs_spin_unlock_cond_irq(&scc->zlock, irq);
+		udelay(2);
+		zs_spin_lock_cond_irq(&scc->zlock, irq);
+	}
+	return loops;
+}
+
+
+static void load_zsregs(struct zs_port *zport, u8 *regs, int irq)
+{
+	/* Let the current transmission finish.  */
+	zs_line_drain(zport, irq);
+	/* Load 'em up.  */
+	write_zsreg(zport, R3, regs[3] & ~RxENABLE);
+	write_zsreg(zport, R5, regs[5] & ~TxENAB);
+	write_zsreg(zport, R4, regs[4]);
+	write_zsreg(zport, R9, regs[9]);
+	write_zsreg(zport, R1, regs[1]);
+	write_zsreg(zport, R2, regs[2]);
+	write_zsreg(zport, R10, regs[10]);
+	write_zsreg(zport, R14, regs[14] & ~BRENABL);
+	write_zsreg(zport, R11, regs[11]);
+	write_zsreg(zport, R12, regs[12]);
+	write_zsreg(zport, R13, regs[13]);
+	write_zsreg(zport, R14, regs[14]);
+	write_zsreg(zport, R15, regs[15]);
+	if (regs[3] & RxENABLE)
+		write_zsreg(zport, R3, regs[3]);
+	if (regs[5] & TxENAB)
+		write_zsreg(zport, R5, regs[5]);
+	return;
+}
+
+
+/*
+ * Status handling routines.
+ */
+
+/*
+ * zs_tx_empty() -- get the transmitter empty status
+ *
+ * Purpose: Let user call ioctl() to get info when the UART physically
+ * 	    is emptied.  On bus types like RS485, the transmitter must
+ * 	    release the bus after transmitting.  This must be done when
+ * 	    the transmit shift register is empty, not be done when the
+ * 	    transmit holding register is empty.  This functionality
+ * 	    allows an RS485 driver to be written in user space.
+ */
+static unsigned int zs_tx_empty(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	unsigned long flags;
+	u8 status;
+
+	spin_lock_irqsave(&scc->zlock, flags);
+	status = read_zsreg(zport, R1);
+	spin_unlock_irqrestore(&scc->zlock, flags);
+
+	return status & ALL_SNT ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int zs_raw_get_ab_mctrl(struct zs_port *zport_a,
+					struct zs_port *zport_b)
+{
+	u8 status_a, status_b;
+	unsigned int mctrl;
+
+	status_a = read_zsreg(zport_a, R0);
+	status_b = read_zsreg(zport_b, R0);
+
+	mctrl = ((status_b & CTS) ? TIOCM_CTS : 0) |
+		((status_b & DCD) ? TIOCM_CAR : 0) |
+		((status_a & DCD) ? TIOCM_RNG : 0) |
+		((status_a & SYNC_HUNT) ? TIOCM_DSR : 0);
+
+	return mctrl;
+}
+
+static unsigned int zs_raw_get_mctrl(struct zs_port *zport)
+{
+	struct zs_port *zport_a = &zport->scc->zport[ZS_CHAN_A];
+
+	return zport != zport_a ? zs_raw_get_ab_mctrl(zport_a, zport) : 0;
+}
+
+static unsigned int zs_raw_xor_mctrl(struct zs_port *zport)
+{
+	struct zs_port *zport_a = &zport->scc->zport[ZS_CHAN_A];
+	unsigned int mmask, mctrl, delta;
+	u8 mask_a, mask_b;
+
+	if (zport == zport_a)
+		return 0;
+
+	mask_a = zport_a->regs[15];
+	mask_b = zport->regs[15];
+
+	mmask = ((mask_b & CTSIE) ? TIOCM_CTS : 0) |
+		((mask_b & DCDIE) ? TIOCM_CAR : 0) |
+		((mask_a & DCDIE) ? TIOCM_RNG : 0) |
+		((mask_a & SYNCIE) ? TIOCM_DSR : 0);
+
+	mctrl = zport->mctrl;
+	if (mmask) {
+		mctrl &= ~mmask;
+		mctrl |= zs_raw_get_ab_mctrl(zport_a, zport) & mmask;
+	}
+
+	delta = mctrl ^ zport->mctrl;
+	if (delta)
+		zport->mctrl = mctrl;
+
+	return delta;
+}
+
+static unsigned int zs_get_mctrl(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	unsigned int mctrl;
+
+	spin_lock(&scc->zlock);
+	mctrl = zs_raw_get_mctrl(zport);
+	spin_unlock(&scc->zlock);
+
+	return mctrl;
+}
+
+static void zs_set_mctrl(struct uart_port *uport, unsigned int mctrl)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	struct zs_port *zport_a = &scc->zport[ZS_CHAN_A];
+	u8 oldloop, newloop;
+
+	spin_lock(&scc->zlock);
+	if (zport != zport_a) {
+		if (mctrl & TIOCM_DTR)
+			zport_a->regs[5] |= DTR;
+		else
+			zport_a->regs[5] &= ~DTR;
+		if (mctrl & TIOCM_RTS)
+			zport_a->regs[5] |= RTS;
+		else
+			zport_a->regs[5] &= ~RTS;
+		write_zsreg(zport_a, R5, zport_a->regs[5]);
+	}
+
+	/* Rarely modified, so don't poke at hardware unless necessary. */
+	oldloop = zport->regs[14];
+	newloop = oldloop;
+	if (mctrl & TIOCM_LOOP)
+		newloop |= LOOPBAK;
+	else
+		newloop &= ~LOOPBAK;
+	if (newloop != oldloop) {
+		zport->regs[14] = newloop;
+		write_zsreg(zport, R14, zport->regs[14]);
+	}
+	spin_unlock(&scc->zlock);
+}
+
+static void zs_raw_stop_tx(struct zs_port *zport)
+{
+	write_zsreg(zport, R0, RES_Tx_P);
+	zport->tx_stopped = 1;
+}
+
+static void zs_stop_tx(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+
+	spin_lock(&scc->zlock);
+	zs_raw_stop_tx(zport);
+	spin_unlock(&scc->zlock);
+}
+
+static void zs_raw_transmit_chars(struct zs_port *);
+
+static void zs_start_tx(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+
+	spin_lock(&scc->zlock);
+	if (zport->tx_stopped) {
+		zs_transmit_drain(zport, 0);
+		zport->tx_stopped = 0;
+		zs_raw_transmit_chars(zport);
+	}
+	spin_unlock(&scc->zlock);
+}
+
+static void zs_stop_rx(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	struct zs_port *zport_a = &scc->zport[ZS_CHAN_A];
+
+	spin_lock(&scc->zlock);
+	zport->regs[15] &= ~BRKIE;
+	zport->regs[1] &= ~(RxINT_MASK | TxINT_ENAB);
+	zport->regs[1] |= RxINT_DISAB;
+
+	if (zport != zport_a) {
+		/* A-side DCD tracks RI and SYNC tracks DSR.  */
+		zport_a->regs[15] &= ~(DCDIE | SYNCIE);
+		write_zsreg(zport_a, R15, zport_a->regs[15]);
+		if (!(zport_a->regs[15] & BRKIE)) {
+			zport_a->regs[1] &= ~EXT_INT_ENAB;
+			write_zsreg(zport_a, R1, zport_a->regs[1]);
+		}
+
+		/* This-side DCD tracks DCD and CTS tracks CTS.  */
+		zport->regs[15] &= ~(DCDIE | CTSIE);
+		zport->regs[1] &= ~EXT_INT_ENAB;
+	} else {
+		/* DCD tracks RI and SYNC tracks DSR for the B side.  */
+		if (!(zport->regs[15] & (DCDIE | SYNCIE)))
+			zport->regs[1] &= ~EXT_INT_ENAB;
+	}
+
+	write_zsreg(zport, R15, zport->regs[15]);
+	write_zsreg(zport, R1, zport->regs[1]);
+	spin_unlock(&scc->zlock);
+}
+
+static void zs_enable_ms(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	struct zs_port *zport_a = &scc->zport[ZS_CHAN_A];
+
+	if (zport == zport_a)
+		return;
+
+	spin_lock(&scc->zlock);
+
+	/* Clear Ext interrupts if not being handled already.  */
+	if (!(zport_a->regs[1] & EXT_INT_ENAB))
+		write_zsreg(zport_a, R0, RES_EXT_INT);
+
+	/* A-side DCD tracks RI and SYNC tracks DSR.  */
+	zport_a->regs[1] |= EXT_INT_ENAB;
+	zport_a->regs[15] |= DCDIE | SYNCIE;
+
+	/* This-side DCD tracks DCD and CTS tracks CTS.  */
+	zport->regs[15] |= DCDIE | CTSIE;
+
+	zs_raw_xor_mctrl(zport);
+
+	write_zsreg(zport_a, R1, zport_a->regs[1]);
+	write_zsreg(zport_a, R15, zport_a->regs[15]);
+	write_zsreg(zport, R15, zport->regs[15]);
+	spin_unlock(&scc->zlock);
+}
+
+static void zs_break_ctl(struct uart_port *uport, int break_state)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&scc->zlock, flags);
+	if (break_state == -1)
+		zport->regs[5] |= SND_BRK;
+	else
+		zport->regs[5] &= ~SND_BRK;
+	write_zsreg(zport, R5, zport->regs[5]);
+	spin_unlock_irqrestore(&scc->zlock, flags);
+}
+
+
+/*
+ * Interrupt handling routines.
+ */
+#define Rx_BRK 0x0100			/* BREAK event software flag.  */
+#define Rx_SYS 0x0200			/* SysRq event software flag.  */
+
+static void zs_receive_chars(struct zs_port *zport)
+{
+	struct uart_port *uport = &zport->port;
+	struct zs_scc *scc = zport->scc;
+	struct uart_icount *icount;
+	unsigned int avail, status, ch, flag;
+	int count;
+
+	for (count = 16; count; count--) {
+		spin_lock(&scc->zlock);
+		avail = read_zsreg(zport, R0) & Rx_CH_AV;
+		spin_unlock(&scc->zlock);
+		if (!avail)
+			break;
+
+		spin_lock(&scc->zlock);
+		status = read_zsreg(zport, R1) & (Rx_OVR | FRM_ERR | PAR_ERR);
+		ch = read_zsdata(zport);
+		spin_unlock(&scc->zlock);
+
+		flag = TTY_NORMAL;
+
+		icount = &uport->icount;
+		icount->rx++;
+
+		/* Handle the null char got when BREAK is removed.  */
+		if (!ch)
+			status |= zport->tty_break;
+		if (unlikely(status &
+			     (Rx_OVR | FRM_ERR | PAR_ERR | Rx_SYS | Rx_BRK))) {
+			zport->tty_break = 0;
+
+			/* Reset the error indication.  */
+			if (status & (Rx_OVR | FRM_ERR | PAR_ERR)) {
+				spin_lock(&scc->zlock);
+				write_zsreg(zport, R0, ERR_RES);
+				spin_unlock(&scc->zlock);
+			}
+
+			if (status & (Rx_SYS | Rx_BRK)) {
+				icount->brk++;
+				/* SysRq discards the null char.  */
+				if (status & Rx_SYS)
+					continue;
+			} else if (status & FRM_ERR)
+				icount->frame++;
+			else if (status & PAR_ERR)
+				icount->parity++;
+			if (status & Rx_OVR)
+				icount->overrun++;
+
+			status &= uport->read_status_mask;
+			if (status & Rx_BRK)
+				flag = TTY_BREAK;
+			else if (status & FRM_ERR)
+				flag = TTY_FRAME;
+			else if (status & PAR_ERR)
+				flag = TTY_PARITY;
+		}
+
+		if (uart_handle_sysrq_char(uport, ch))
+			continue;
+
+		uart_insert_char(uport, status, Rx_OVR, ch, flag);
+	}
+
+	tty_flip_buffer_push(uport->info->tty);
+}
+
+static void zs_raw_transmit_chars(struct zs_port *zport)
+{
+	struct circ_buf *xmit = &zport->port.info->xmit;
+
+	/* XON/XOFF chars.  */
+	if (zport->port.x_char) {
+		write_zsdata(zport, zport->port.x_char);
+		zport->port.icount.tx++;
+		zport->port.x_char = 0;
+		return;
+	}
+
+	/* If nothing to do or stopped or hardware stopped.  */
+	if (uart_circ_empty(xmit) || uart_tx_stopped(&zport->port)) {
+		zs_raw_stop_tx(zport);
+		return;
+	}
+
+	/* Send char.  */
+	write_zsdata(zport, xmit->buf[xmit->tail]);
+	xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+	zport->port.icount.tx++;
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(&zport->port);
+
+	/* Are we are done?  */
+	if (uart_circ_empty(xmit))
+		zs_raw_stop_tx(zport);
+}
+
+static void zs_transmit_chars(struct zs_port *zport)
+{
+	struct zs_scc *scc = zport->scc;
+
+	spin_lock(&scc->zlock);
+	zs_raw_transmit_chars(zport);
+	spin_unlock(&scc->zlock);
+}
+
+static void zs_status_handle(struct zs_port *zport, struct zs_port *zport_a)
+{
+	struct uart_port *uport = &zport->port;
+	struct zs_scc *scc = zport->scc;
+	unsigned int delta;
+	u8 status, brk;
+
+	spin_lock(&scc->zlock);
+
+	/* Get status from Read Register 0.  */
+	status = read_zsreg(zport, R0);
+
+	if (zport->regs[15] & BRKIE) {
+		brk = status & BRK_ABRT;
+		if (brk && !zport->brk) {
+			spin_unlock(&scc->zlock);
+			if (uart_handle_break(uport))
+				zport->tty_break = Rx_SYS;
+			else
+				zport->tty_break = Rx_BRK;
+			spin_lock(&scc->zlock);
+		}
+		zport->brk = brk;
+	}
+
+	if (zport != zport_a) {
+		delta = zs_raw_xor_mctrl(zport);
+		spin_unlock(&scc->zlock);
+
+		if (delta & TIOCM_CTS)
+			uart_handle_cts_change(uport,
+					       zport->mctrl & TIOCM_CTS);
+		if (delta & TIOCM_CAR)
+			uart_handle_dcd_change(uport,
+					       zport->mctrl & TIOCM_CAR);
+		if (delta & TIOCM_RNG)
+			uport->icount.dsr++;
+		if (delta & TIOCM_DSR)
+			uport->icount.rng++;
+
+		if (delta)
+			wake_up_interruptible(&uport->info->delta_msr_wait);
+
+		spin_lock(&scc->zlock);
+	}
+
+	/* Clear the status condition...  */
+	write_zsreg(zport, R0, RES_EXT_INT);
+
+	spin_unlock(&scc->zlock);
+}
+
+/*
+ * This is the Z85C30 driver's generic interrupt routine.
+ */
+static irqreturn_t zs_interrupt(int irq, void *dev_id)
+{
+	struct zs_scc *scc = dev_id;
+	struct zs_port *zport_a = &scc->zport[ZS_CHAN_A];
+	struct zs_port *zport_b = &scc->zport[ZS_CHAN_B];
+	irqreturn_t status = IRQ_NONE;
+	u8 zs_intreg;
+	int count;
+
+	/*
+	 * NOTE: The read register 3, which holds the irq status,
+	 *       does so for both channels on each chip.  Although
+	 *       the status value itself must be read from the A
+	 *       channel and is only valid when read from channel A.
+	 *       Yes... broken hardware...
+	 */
+	for (count = 16; count; count--) {
+		spin_lock(&scc->zlock);
+		zs_intreg = read_zsreg(zport_a, R3);
+		spin_unlock(&scc->zlock);
+		if (!zs_intreg)
+			break;
+
+		/*
+		 * We do not like losing characters, so we prioritise
+		 * interrupt sources a little bit differently than
+		 * the SCC would, was it allowed to.
+		 */
+		if (zs_intreg & CHBRxIP)
+			zs_receive_chars(zport_b);
+		if (zs_intreg & CHARxIP)
+			zs_receive_chars(zport_a);
+		if (zs_intreg & CHBEXT)
+			zs_status_handle(zport_b, zport_a);
+		if (zs_intreg & CHAEXT)
+			zs_status_handle(zport_a, zport_a);
+		if (zs_intreg & CHBTxIP)
+			zs_transmit_chars(zport_b);
+		if (zs_intreg & CHATxIP)
+			zs_transmit_chars(zport_a);
+
+		status = IRQ_HANDLED;
+	}
+
+	return status;
+}
+
+
+/*
+ * Finally, routines used to initialize the serial port.
+ */
+static int zs_startup(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	unsigned long flags;
+	int irq_guard;
+	int ret;
+
+	irq_guard = atomic_add_return(1, &scc->irq_guard);
+	if (irq_guard == 1) {
+		ret = request_irq(zport->port.irq, zs_interrupt,
+				  IRQF_SHARED, "scc", scc);
+		if (ret) {
+			atomic_add(-1, &scc->irq_guard);
+			printk(KERN_ERR "zs: can't get irq %d\n",
+			       zport->port.irq);
+			return ret;
+		}
+	}
+
+	spin_lock_irqsave(&scc->zlock, flags);
+
+	/* Clear the receive FIFO.  */
+	zs_receive_drain(zport);
+
+	/* Clear the interrupt registers.  */
+	write_zsreg(zport, R0, ERR_RES);
+	write_zsreg(zport, R0, RES_Tx_P);
+	/* But Ext only if not being handled already.  */
+	if (!(zport->regs[1] & EXT_INT_ENAB))
+		write_zsreg(zport, R0, RES_EXT_INT);
+
+	/* Finally, enable sequencing and interrupts.  */
+	zport->regs[1] &= ~RxINT_MASK;
+	zport->regs[1] |= RxINT_ALL | TxINT_ENAB | EXT_INT_ENAB;
+	zport->regs[3] |= RxENABLE;
+	zport->regs[5] |= TxENAB;
+	zport->regs[15] |= BRKIE;
+	write_zsreg(zport, R1, zport->regs[1]);
+	write_zsreg(zport, R3, zport->regs[3]);
+	write_zsreg(zport, R5, zport->regs[5]);
+	write_zsreg(zport, R15, zport->regs[15]);
+
+	/* Record the current state of RR0.  */
+	zport->mctrl = zs_raw_get_mctrl(zport);
+	zport->brk = read_zsreg(zport, R0) & BRK_ABRT;
+
+	zport->tx_stopped = 1;
+
+	spin_unlock_irqrestore(&scc->zlock, flags);
+
+	return 0;
+}
+
+static void zs_shutdown(struct uart_port *uport)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	unsigned long flags;
+	int irq_guard;
+
+	spin_lock_irqsave(&scc->zlock, flags);
+
+	zport->regs[5] &= ~TxENAB;
+	zport->regs[3] &= ~RxENABLE;
+	write_zsreg(zport, R5, zport->regs[5]);
+	write_zsreg(zport, R3, zport->regs[3]);
+
+	spin_unlock_irqrestore(&scc->zlock, flags);
+
+	irq_guard = atomic_add_return(-1, &scc->irq_guard);
+	if (!irq_guard)
+		free_irq(zport->port.irq, scc);
+}
+
+
+static void zs_reset(struct zs_port *zport)
+{
+	struct zs_scc *scc = zport->scc;
+	int irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&scc->zlock, flags);
+	irq = !irqs_disabled_flags(flags);
+	if (!scc->initialised) {
+		/* Reset the pointer first, just in case...  */
+		read_zsreg(zport, R0);
+		/* And let the current transmission finish.  */
+		zs_line_drain(zport, irq);
+		write_zsreg(zport, R9, FHWRES);
+		udelay(10);
+		write_zsreg(zport, R9, 0);
+		scc->initialised = 1;
+	}
+	load_zsregs(zport, zport->regs, irq);
+	spin_unlock_irqrestore(&scc->zlock, flags);
+}
+
+static void zs_set_termios(struct uart_port *uport, struct ktermios *termios,
+			   struct ktermios *old_termios)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	struct zs_port *zport_a = &scc->zport[ZS_CHAN_A];
+	int irq;
+	unsigned int baud, brg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&scc->zlock, flags);
+	irq = !irqs_disabled_flags(flags);
+
+	/* Byte size.  */
+	zport->regs[3] &= ~RxNBITS_MASK;
+	zport->regs[5] &= ~TxNBITS_MASK;
+	switch (termios->c_cflag & CSIZE) {
+	case CS5:
+		zport->regs[3] |= Rx5;
+		zport->regs[5] |= Tx5;
+		break;
+	case CS6:
+		zport->regs[3] |= Rx6;
+		zport->regs[5] |= Tx6;
+		break;
+	case CS7:
+		zport->regs[3] |= Rx7;
+		zport->regs[5] |= Tx7;
+		break;
+	case CS8:
+	default:
+		zport->regs[3] |= Rx8;
+		zport->regs[5] |= Tx8;
+		break;
+	}
+
+	/* Parity and stop bits.  */
+	zport->regs[4] &= ~(XCLK_MASK | SB_MASK | PAR_ENA | PAR_EVEN);
+	if (termios->c_cflag & CSTOPB)
+		zport->regs[4] |= SB2;
+	else
+		zport->regs[4] |= SB1;
+	if (termios->c_cflag & PARENB)
+		zport->regs[4] |= PAR_ENA;
+	if (!(termios->c_cflag & PARODD))
+		zport->regs[4] |= PAR_EVEN;
+	switch (zport->clk_mode) {
+	case 64:
+		zport->regs[4] |= X64CLK;
+		break;
+	case 32:
+		zport->regs[4] |= X32CLK;
+		break;
+	case 16:
+		zport->regs[4] |= X16CLK;
+		break;
+	case 1:
+		zport->regs[4] |= X1CLK;
+		break;
+	default:
+		BUG();
+	}
+
+	baud = uart_get_baud_rate(uport, termios, old_termios, 0,
+				  uport->uartclk / zport->clk_mode / 4);
+
+	brg = ZS_BPS_TO_BRG(baud, uport->uartclk / zport->clk_mode);
+	zport->regs[12] = brg & 0xff;
+	zport->regs[13] = (brg >> 8) & 0xff;
+
+	uart_update_timeout(uport, termios->c_cflag, baud);
+
+	uport->read_status_mask = Rx_OVR;
+	if (termios->c_iflag & INPCK)
+		uport->read_status_mask |= FRM_ERR | PAR_ERR;
+	if (termios->c_iflag & (BRKINT | PARMRK))
+		uport->read_status_mask |= Rx_BRK;
+
+	uport->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		uport->ignore_status_mask |= FRM_ERR | PAR_ERR;
+	if (termios->c_iflag & IGNBRK) {
+		uport->ignore_status_mask |= Rx_BRK;
+		if (termios->c_iflag & IGNPAR)
+			uport->ignore_status_mask |= Rx_OVR;
+	}
+
+	if (termios->c_cflag & CREAD)
+		zport->regs[3] |= RxENABLE;
+	else
+		zport->regs[3] &= ~RxENABLE;
+
+	if (zport != zport_a) {
+		if (!(termios->c_cflag & CLOCAL)) {
+			zport->regs[15] |= DCDIE;
+		} else
+			zport->regs[15] &= ~DCDIE;
+		if (termios->c_cflag & CRTSCTS) {
+			zport->regs[15] |= CTSIE;
+		} else
+			zport->regs[15] &= ~CTSIE;
+		zs_raw_xor_mctrl(zport);
+	}
+
+	/* Load up the new values.  */
+	load_zsregs(zport, zport->regs, irq);
+
+	spin_unlock_irqrestore(&scc->zlock, flags);
+}
+
+
+static const char *zs_type(struct uart_port *uport)
+{
+	return "Z85C30 SCC";
+}
+
+static void zs_release_port(struct uart_port *uport)
+{
+	iounmap(uport->membase);
+	uport->membase = 0;
+	release_mem_region(uport->mapbase, ZS_CHAN_IO_SIZE);
+}
+
+static int zs_map_port(struct uart_port *uport)
+{
+	if (!uport->membase)
+		uport->membase = ioremap_nocache(uport->mapbase,
+						 ZS_CHAN_IO_SIZE);
+	if (!uport->membase) {
+		printk(KERN_ERR "zs: Cannot map MMIO\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static int zs_request_port(struct uart_port *uport)
+{
+	int ret;
+
+	if (!request_mem_region(uport->mapbase, ZS_CHAN_IO_SIZE, "scc")) {
+		printk(KERN_ERR "zs: Unable to reserve MMIO resource\n");
+		return -EBUSY;
+	}
+	ret = zs_map_port(uport);
+	if (ret) {
+		release_mem_region(uport->mapbase, ZS_CHAN_IO_SIZE);
+		return ret;
+	}
+	return 0;
+}
+
+static void zs_config_port(struct uart_port *uport, int flags)
+{
+	struct zs_port *zport = to_zport(uport);
+
+	if (flags & UART_CONFIG_TYPE) {
+		if (zs_request_port(uport))
+			return;
+
+		uport->type = PORT_ZS;
+
+		zs_reset(zport);
+	}
+}
+
+static int zs_verify_port(struct uart_port *uport, struct serial_struct *ser)
+{
+	struct zs_port *zport = to_zport(uport);
+	int ret = 0;
+
+	if (ser->type != PORT_UNKNOWN && ser->type != PORT_ZS)
+		ret = -EINVAL;
+	if (ser->irq != uport->irq)
+		ret = -EINVAL;
+	if (ser->baud_base != uport->uartclk / zport->clk_mode / 4)
+		ret = -EINVAL;
+	return ret;
+}
+
+
+static struct uart_ops zs_ops = {
+	.tx_empty	= zs_tx_empty,
+	.set_mctrl	= zs_set_mctrl,
+	.get_mctrl	= zs_get_mctrl,
+	.stop_tx	= zs_stop_tx,
+	.start_tx	= zs_start_tx,
+	.stop_rx	= zs_stop_rx,
+	.enable_ms	= zs_enable_ms,
+	.break_ctl	= zs_break_ctl,
+	.startup	= zs_startup,
+	.shutdown	= zs_shutdown,
+	.set_termios	= zs_set_termios,
+	.type		= zs_type,
+	.release_port	= zs_release_port,
+	.request_port	= zs_request_port,
+	.config_port	= zs_config_port,
+	.verify_port	= zs_verify_port,
+};
+
+/*
+ * Initialize Z85C30 port structures.
+ */
+static int __init zs_probe_sccs(void)
+{
+	static int probed;
+	struct zs_parms zs_parms;
+	int chip, side, irq;
+	int n_chips = 0;
+	int i;
+
+	if (probed)
+		return 0;
+
+	irq = dec_interrupt[DEC_IRQ_SCC0];
+	if (irq >= 0) {
+		zs_parms.scc[n_chips] = IOASIC_SCC0;
+		zs_parms.irq[n_chips] = dec_interrupt[DEC_IRQ_SCC0];
+		n_chips++;
+	}
+	irq = dec_interrupt[DEC_IRQ_SCC1];
+	if (irq >= 0) {
+		zs_parms.scc[n_chips] = IOASIC_SCC1;
+		zs_parms.irq[n_chips] = dec_interrupt[DEC_IRQ_SCC1];
+		n_chips++;
+	}
+	if (!n_chips)
+		return -ENXIO;
+
+	probed = 1;
+
+	for (chip = 0; chip < n_chips; chip++) {
+		spin_lock_init(&zs_sccs[chip].zlock);
+		for (side = 0; side < ZS_NUM_CHAN; side++) {
+			struct zs_port *zport = &zs_sccs[chip].zport[side];
+			struct uart_port *uport = &zport->port;
+
+			zport->scc	= &zs_sccs[chip];
+			zport->clk_mode	= 16;
+
+			uport->irq	= zs_parms.irq[chip];
+			uport->uartclk	= ZS_CLOCK;
+			uport->fifosize	= 1;
+			uport->iotype	= UPIO_MEM;
+			uport->flags	= UPF_BOOT_AUTOCONF;
+			uport->ops	= &zs_ops;
+			uport->line	= chip * ZS_NUM_CHAN + side;
+			uport->mapbase	= dec_kn_slot_base +
+					  zs_parms.scc[chip] +
+					  (side ^ ZS_CHAN_B) * ZS_CHAN_IO_SIZE;
+
+			for (i = 0; i < ZS_NUM_REGS; i++)
+				zport->regs[i] = zs_init_regs[i];
+		}
+	}
+
+	return 0;
+}
+
+
+#ifdef CONFIG_SERIAL_ZS_CONSOLE
+static void zs_console_putchar(struct uart_port *uport, int ch)
+{
+	struct zs_port *zport = to_zport(uport);
+	struct zs_scc *scc = zport->scc;
+	int irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&scc->zlock, flags);
+	irq = !irqs_disabled_flags(flags);
+	if (zs_transmit_drain(zport, irq))
+		write_zsdata(zport, ch);
+	spin_unlock_irqrestore(&scc->zlock, flags);
+}
+
+/*
+ * Print a string to the serial port trying not to disturb
+ * any possible real use of the port...
+ */
+static void zs_console_write(struct console *co, const char *s,
+			     unsigned int count)
+{
+	int chip = co->index / ZS_NUM_CHAN, side = co->index % ZS_NUM_CHAN;
+	struct zs_port *zport = &zs_sccs[chip].zport[side];
+	struct zs_scc *scc = zport->scc;
+	unsigned long flags;
+	u8 txint, txenb;
+	int irq;
+
+	/* Disable transmit interrupts and enable the transmitter. */
+	spin_lock_irqsave(&scc->zlock, flags);
+	txint = zport->regs[1];
+	txenb = zport->regs[5];
+	if (txint & TxINT_ENAB) {
+		zport->regs[1] = txint & ~TxINT_ENAB;
+		write_zsreg(zport, R1, zport->regs[1]);
+	}
+	if (!(txenb & TxENAB)) {
+		zport->regs[5] = txenb | TxENAB;
+		write_zsreg(zport, R5, zport->regs[5]);
+	}
+	spin_unlock_irqrestore(&scc->zlock, flags);
+
+	uart_console_write(&zport->port, s, count, zs_console_putchar);
+
+	/* Restore transmit interrupts and the transmitter enable. */
+	spin_lock_irqsave(&scc->zlock, flags);
+	irq = !irqs_disabled_flags(flags);
+	zs_line_drain(zport, irq);
+	if (!(txenb & TxENAB)) {
+		zport->regs[5] &= ~TxENAB;
+		write_zsreg(zport, R5, zport->regs[5]);
+	}
+	if (txint & TxINT_ENAB) {
+		zport->regs[1] |= TxINT_ENAB;
+		write_zsreg(zport, R1, zport->regs[1]);
+	}
+	spin_unlock_irqrestore(&scc->zlock, flags);
+}
+
+/*
+ * Setup serial console baud/bits/parity.  We do two things here:
+ * - construct a cflag setting for the first uart_open()
+ * - initialise the serial port
+ * Return non-zero if we didn't find a serial port.
+ */
+static int __init zs_console_setup(struct console *co, char *options)
+{
+	int chip = co->index / ZS_NUM_CHAN, side = co->index % ZS_NUM_CHAN;
+	struct zs_port *zport = &zs_sccs[chip].zport[side];
+	struct uart_port *uport = &zport->port;
+	int baud = 9600;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+	int ret;
+
+	ret = zs_map_port(uport);
+	if (ret)
+		return ret;
+
+	zs_reset(zport);
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+	return uart_set_options(uport, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver zs_reg;
+static struct console zs_console = {
+	.name	= "ttyS",
+	.write	= zs_console_write,
+	.device	= uart_console_device,
+	.setup	= zs_console_setup,
+	.flags	= CON_PRINTBUFFER,
+	.index	= -1,
+	.data	= &zs_reg,
+};
+
+/*
+ *	Register console.
+ */
+static int __init zs_serial_console_init(void)
+{
+	int ret;
+
+	ret = zs_probe_sccs();
+	if (ret)
+		return ret;
+	register_console(&zs_console);
+
+	return 0;
+}
+
+console_initcall(zs_serial_console_init);
+
+#define SERIAL_ZS_CONSOLE	&zs_console
+#else
+#define SERIAL_ZS_CONSOLE	NULL
+#endif /* CONFIG_SERIAL_ZS_CONSOLE */
+
+static struct uart_driver zs_reg = {
+	.owner			= THIS_MODULE,
+	.driver_name		= "serial",
+	.dev_name		= "ttyS",
+	.major			= TTY_MAJOR,
+	.minor			= 64,
+	.nr			= ZS_NUM_SCCS * ZS_NUM_CHAN,
+	.cons			= SERIAL_ZS_CONSOLE,
+};
+
+/* zs_init inits the driver. */
+static int __init zs_init(void)
+{
+	int i, ret;
+
+	pr_info("%s%s\n", zs_name, zs_version);
+
+	/* Find out how many Z85C30 SCCs we have.  */
+	ret = zs_probe_sccs();
+	if (ret)
+		return ret;
+
+	ret = uart_register_driver(&zs_reg);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ZS_NUM_SCCS * ZS_NUM_CHAN; i++) {
+		struct zs_scc *scc = &zs_sccs[i / ZS_NUM_CHAN];
+		struct zs_port *zport = &scc->zport[i % ZS_NUM_CHAN];
+		struct uart_port *uport = &zport->port;
+
+		if (zport->scc)
+			uart_add_one_port(&zs_reg, uport);
+	}
+
+	return 0;
+}
+
+static void __exit zs_exit(void)
+{
+	int i;
+
+	for (i = ZS_NUM_SCCS * ZS_NUM_CHAN - 1; i >= 0; i--) {
+		struct zs_scc *scc = &zs_sccs[i / ZS_NUM_CHAN];
+		struct zs_port *zport = &scc->zport[i % ZS_NUM_CHAN];
+		struct uart_port *uport = &zport->port;
+
+		if (zport->scc)
+			uart_remove_one_port(&zs_reg, uport);
+	}
+
+	uart_unregister_driver(&zs_reg);
+}
+
+module_init(zs_init);
+module_exit(zs_exit);
diff --git a/drivers/serial/zs.h b/drivers/serial/zs.h
new file mode 100644
index 000000000000..aa921b57d827
--- /dev/null
+++ b/drivers/serial/zs.h
@@ -0,0 +1,284 @@
+/*
+ * zs.h: Definitions for the DECstation Z85C30 serial driver.
+ *
+ * Adapted from drivers/sbus/char/sunserial.h by Paul Mackerras.
+ * Adapted from drivers/macintosh/macserial.h by Harald Koerfgen.
+ *
+ * Copyright (C) 1996 Paul Mackerras (Paul.Mackerras@cs.anu.edu.au)
+ * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2004, 2005, 2007  Maciej W. Rozycki
+ */
+#ifndef _SERIAL_ZS_H
+#define _SERIAL_ZS_H
+
+#ifdef __KERNEL__
+
+#define ZS_NUM_REGS 16
+
+/*
+ * This is our internal structure for each serial port's state.
+ */
+struct zs_port {
+	struct zs_scc	*scc;			/* Containing SCC.  */
+	struct uart_port port;			/* Underlying UART.  */
+
+	int		clk_mode;		/* May be 1, 16, 32, or 64.  */
+
+	unsigned int	tty_break;		/* Set on BREAK condition.  */
+	int		tx_stopped;		/* Output is suspended.  */
+
+	unsigned int	mctrl;			/* State of modem lines.  */
+	u8		brk;			/* BREAK state from RR0.  */
+
+	u8		regs[ZS_NUM_REGS];	/* Channel write registers.  */
+};
+
+/*
+ * Per-SCC state for locking and the interrupt handler.
+ */
+struct zs_scc {
+	struct zs_port	zport[2];
+	spinlock_t	zlock;
+	atomic_t	irq_guard;
+	int		initialised;
+};
+
+#endif /* __KERNEL__ */
+
+/*
+ * Conversion routines to/from brg time constants from/to bits per second.
+ */
+#define ZS_BRG_TO_BPS(brg, freq) ((freq) / 2 / ((brg) + 2))
+#define ZS_BPS_TO_BRG(bps, freq) ((((freq) + (bps)) / (2 * (bps))) - 2)
+
+/*
+ * The Zilog register set.
+ */
+
+/* Write Register 0 (Command) */
+#define R0		0	/* Register selects */
+#define R1		1
+#define R2		2
+#define R3		3
+#define R4		4
+#define R5		5
+#define R6		6
+#define R7		7
+#define R8		8
+#define R9		9
+#define R10		10
+#define R11		11
+#define R12		12
+#define R13		13
+#define R14		14
+#define R15		15
+
+#define NULLCODE	0	/* Null Code */
+#define POINT_HIGH	0x8	/* Select upper half of registers */
+#define RES_EXT_INT	0x10	/* Reset Ext. Status Interrupts */
+#define SEND_ABORT	0x18	/* HDLC Abort */
+#define RES_RxINT_FC	0x20	/* Reset RxINT on First Character */
+#define RES_Tx_P	0x28	/* Reset TxINT Pending */
+#define ERR_RES		0x30	/* Error Reset */
+#define RES_H_IUS	0x38	/* Reset highest IUS */
+
+#define RES_Rx_CRC	0x40	/* Reset Rx CRC Checker */
+#define RES_Tx_CRC	0x80	/* Reset Tx CRC Checker */
+#define RES_EOM_L	0xC0	/* Reset EOM latch */
+
+/* Write Register 1 (Tx/Rx/Ext Int Enable and WAIT/DMA Commands) */
+#define EXT_INT_ENAB	0x1	/* Ext Int Enable */
+#define TxINT_ENAB	0x2	/* Tx Int Enable */
+#define PAR_SPEC	0x4	/* Parity is special condition */
+
+#define RxINT_DISAB	0	/* Rx Int Disable */
+#define RxINT_FCERR	0x8	/* Rx Int on First Character Only or Error */
+#define RxINT_ALL	0x10	/* Int on all Rx Characters or error */
+#define RxINT_ERR	0x18	/* Int on error only */
+#define RxINT_MASK	0x18
+
+#define WT_RDY_RT	0x20	/* Wait/Ready on R/T */
+#define WT_FN_RDYFN	0x40	/* Wait/FN/Ready FN */
+#define WT_RDY_ENAB	0x80	/* Wait/Ready Enable */
+
+/* Write Register 2 (Interrupt Vector) */
+
+/* Write Register 3 (Receive Parameters and Control) */
+#define RxENABLE	0x1	/* Rx Enable */
+#define SYNC_L_INH	0x2	/* Sync Character Load Inhibit */
+#define ADD_SM		0x4	/* Address Search Mode (SDLC) */
+#define RxCRC_ENAB	0x8	/* Rx CRC Enable */
+#define ENT_HM		0x10	/* Enter Hunt Mode */
+#define AUTO_ENAB	0x20	/* Auto Enables */
+#define Rx5		0x0	/* Rx 5 Bits/Character */
+#define Rx7		0x40	/* Rx 7 Bits/Character */
+#define Rx6		0x80	/* Rx 6 Bits/Character */
+#define Rx8		0xc0	/* Rx 8 Bits/Character */
+#define RxNBITS_MASK	0xc0
+
+/* Write Register 4 (Transmit/Receive Miscellaneous Parameters and Modes) */
+#define PAR_ENA		0x1	/* Parity Enable */
+#define PAR_EVEN	0x2	/* Parity Even/Odd* */
+
+#define SYNC_ENAB	0	/* Sync Modes Enable */
+#define SB1		0x4	/* 1 stop bit/char */
+#define SB15		0x8	/* 1.5 stop bits/char */
+#define SB2		0xc	/* 2 stop bits/char */
+#define SB_MASK		0xc
+
+#define MONSYNC		0	/* 8 Bit Sync character */
+#define BISYNC		0x10	/* 16 bit sync character */
+#define SDLC		0x20	/* SDLC Mode (01111110 Sync Flag) */
+#define EXTSYNC		0x30	/* External Sync Mode */
+
+#define X1CLK		0x0	/* x1 clock mode */
+#define X16CLK		0x40	/* x16 clock mode */
+#define X32CLK		0x80	/* x32 clock mode */
+#define X64CLK		0xc0	/* x64 clock mode */
+#define XCLK_MASK	0xc0
+
+/* Write Register 5 (Transmit Parameters and Controls) */
+#define TxCRC_ENAB	0x1	/* Tx CRC Enable */
+#define RTS		0x2	/* RTS */
+#define SDLC_CRC	0x4	/* SDLC/CRC-16 */
+#define TxENAB		0x8	/* Tx Enable */
+#define SND_BRK		0x10	/* Send Break */
+#define Tx5		0x0	/* Tx 5 bits (or less)/character */
+#define Tx7		0x20	/* Tx 7 bits/character */
+#define Tx6		0x40	/* Tx 6 bits/character */
+#define Tx8		0x60	/* Tx 8 bits/character */
+#define TxNBITS_MASK	0x60
+#define DTR		0x80	/* DTR */
+
+/* Write Register 6 (Sync bits 0-7/SDLC Address Field) */
+
+/* Write Register 7 (Sync bits 8-15/SDLC 01111110) */
+
+/* Write Register 8 (Transmit Buffer) */
+
+/* Write Register 9 (Master Interrupt Control) */
+#define VIS		1	/* Vector Includes Status */
+#define NV		2	/* No Vector */
+#define DLC		4	/* Disable Lower Chain */
+#define MIE		8	/* Master Interrupt Enable */
+#define STATHI		0x10	/* Status high */
+#define SOFTACK		0x20	/* Software Interrupt Acknowledge */
+#define NORESET		0	/* No reset on write to R9 */
+#define CHRB		0x40	/* Reset channel B */
+#define CHRA		0x80	/* Reset channel A */
+#define FHWRES		0xc0	/* Force hardware reset */
+
+/* Write Register 10 (Miscellaneous Transmitter/Receiver Control Bits) */
+#define BIT6		1	/* 6 bit/8bit sync */
+#define LOOPMODE	2	/* SDLC Loop mode */
+#define ABUNDER		4	/* Abort/flag on SDLC xmit underrun */
+#define MARKIDLE	8	/* Mark/flag on idle */
+#define GAOP		0x10	/* Go active on poll */
+#define NRZ		0	/* NRZ mode */
+#define NRZI		0x20	/* NRZI mode */
+#define FM1		0x40	/* FM1 (transition = 1) */
+#define FM0		0x60	/* FM0 (transition = 0) */
+#define CRCPS		0x80	/* CRC Preset I/O */
+
+/* Write Register 11 (Clock Mode Control) */
+#define TRxCXT		0	/* TRxC = Xtal output */
+#define TRxCTC		1	/* TRxC = Transmit clock */
+#define TRxCBR		2	/* TRxC = BR Generator Output */
+#define TRxCDP		3	/* TRxC = DPLL output */
+#define TRxCOI		4	/* TRxC O/I */
+#define TCRTxCP		0	/* Transmit clock = RTxC pin */
+#define TCTRxCP		8	/* Transmit clock = TRxC pin */
+#define TCBR		0x10	/* Transmit clock = BR Generator output */
+#define TCDPLL		0x18	/* Transmit clock = DPLL output */
+#define RCRTxCP		0	/* Receive clock = RTxC pin */
+#define RCTRxCP		0x20	/* Receive clock = TRxC pin */
+#define RCBR		0x40	/* Receive clock = BR Generator output */
+#define RCDPLL		0x60	/* Receive clock = DPLL output */
+#define RTxCX		0x80	/* RTxC Xtal/No Xtal */
+
+/* Write Register 12 (Lower Byte of Baud Rate Generator Time Constant) */
+
+/* Write Register 13 (Upper Byte of Baud Rate Generator Time Constant) */
+
+/* Write Register 14 (Miscellaneous Control Bits) */
+#define BRENABL		1	/* Baud rate generator enable */
+#define BRSRC		2	/* Baud rate generator source */
+#define DTRREQ		4	/* DTR/Request function */
+#define AUTOECHO	8	/* Auto Echo */
+#define LOOPBAK		0x10	/* Local loopback */
+#define SEARCH		0x20	/* Enter search mode */
+#define RMC		0x40	/* Reset missing clock */
+#define DISDPLL		0x60	/* Disable DPLL */
+#define SSBR		0x80	/* Set DPLL source = BR generator */
+#define SSRTxC		0xa0	/* Set DPLL source = RTxC */
+#define SFMM		0xc0	/* Set FM mode */
+#define SNRZI		0xe0	/* Set NRZI mode */
+
+/* Write Register 15 (External/Status Interrupt Control) */
+#define WR7P_EN		1	/* WR7 Prime SDLC Feature Enable */
+#define ZCIE		2	/* Zero count IE */
+#define DCDIE		8	/* DCD IE */
+#define SYNCIE		0x10	/* Sync/hunt IE */
+#define CTSIE		0x20	/* CTS IE */
+#define TxUIE		0x40	/* Tx Underrun/EOM IE */
+#define BRKIE		0x80	/* Break/Abort IE */
+
+
+/* Read Register 0 (Transmit/Receive Buffer Status and External Status) */
+#define Rx_CH_AV	0x1	/* Rx Character Available */
+#define ZCOUNT		0x2	/* Zero count */
+#define Tx_BUF_EMP	0x4	/* Tx Buffer empty */
+#define DCD		0x8	/* DCD */
+#define SYNC_HUNT	0x10	/* Sync/hunt */
+#define CTS		0x20	/* CTS */
+#define TxEOM		0x40	/* Tx underrun */
+#define BRK_ABRT	0x80	/* Break/Abort */
+
+/* Read Register 1 (Special Receive Condition Status) */
+#define ALL_SNT		0x1	/* All sent */
+/* Residue Data for 8 Rx bits/char programmed */
+#define RES3		0x8	/* 0/3 */
+#define RES4		0x4	/* 0/4 */
+#define RES5		0xc	/* 0/5 */
+#define RES6		0x2	/* 0/6 */
+#define RES7		0xa	/* 0/7 */
+#define RES8		0x6	/* 0/8 */
+#define RES18		0xe	/* 1/8 */
+#define RES28		0x0	/* 2/8 */
+/* Special Rx Condition Interrupts */
+#define PAR_ERR		0x10	/* Parity Error */
+#define Rx_OVR		0x20	/* Rx Overrun Error */
+#define FRM_ERR		0x40	/* CRC/Framing Error */
+#define END_FR		0x80	/* End of Frame (SDLC) */
+
+/* Read Register 2 (Interrupt Vector (WR2) -- channel A).  */
+
+/* Read Register 2 (Modified Interrupt Vector -- channel B).  */
+
+/* Read Register 3 (Interrupt Pending Bits -- channel A only).  */
+#define CHBEXT		0x1	/* Channel B Ext/Stat IP */
+#define CHBTxIP		0x2	/* Channel B Tx IP */
+#define CHBRxIP		0x4	/* Channel B Rx IP */
+#define CHAEXT		0x8	/* Channel A Ext/Stat IP */
+#define CHATxIP		0x10	/* Channel A Tx IP */
+#define CHARxIP		0x20	/* Channel A Rx IP */
+
+/* Read Register 6 (SDLC FIFO Status and Byte Count LSB) */
+
+/* Read Register 7 (SDLC FIFO Status and Byte Count MSB) */
+
+/* Read Register 8 (Receive Data) */
+
+/* Read Register 10 (Miscellaneous Status Bits) */
+#define ONLOOP		2	/* On loop */
+#define LOOPSEND	0x10	/* Loop sending */
+#define CLK2MIS		0x40	/* Two clocks missing */
+#define CLK1MIS		0x80	/* One clock missing */
+
+/* Read Register 12 (Lower Byte of Baud Rate Generator Constant (WR12)) */
+
+/* Read Register 13 (Upper Byte of Baud Rate Generator Constant (WR13) */
+
+/* Read Register 15 (External/Status Interrupt Control (WR15)) */
+
+#endif /* _SERIAL_ZS_H */
diff --git a/drivers/tc/Makefile b/drivers/tc/Makefile
index 967342692211..c899246bd362 100644
--- a/drivers/tc/Makefile
+++ b/drivers/tc/Makefile
@@ -5,7 +5,6 @@
 # Object file lists.
 
 obj-$(CONFIG_TC) += tc.o tc-driver.o
-obj-$(CONFIG_ZS) += zs.o
 obj-$(CONFIG_VT) += lk201.o lk201-map.o lk201-remap.o
 
 $(obj)/lk201-map.o: $(obj)/lk201-map.c
diff --git a/drivers/tc/zs.c b/drivers/tc/zs.c
deleted file mode 100644
index ed979f13908a..000000000000
--- a/drivers/tc/zs.c
+++ /dev/null
@@ -1,2203 +0,0 @@
-/*
- * decserial.c: Serial port driver for IOASIC DECstations.
- *
- * Derived from drivers/sbus/char/sunserial.c by Paul Mackerras.
- * Derived from drivers/macintosh/macserial.c by Harald Koerfgen.
- *
- * DECstation changes
- * Copyright (C) 1998-2000 Harald Koerfgen
- * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005  Maciej W. Rozycki
- *
- * For the rest of the code the original Copyright applies:
- * Copyright (C) 1996 Paul Mackerras (Paul.Mackerras@cs.anu.edu.au)
- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
- *
- *
- * Note: for IOASIC systems the wiring is as follows:
- *
- * mouse/keyboard:
- * DIN-7 MJ-4  signal        SCC
- * 2     1     TxD       <-  A.TxD
- * 3     4     RxD       ->  A.RxD
- *
- * EIA-232/EIA-423:
- * DB-25 MMJ-6 signal        SCC
- * 2     2     TxD       <-  B.TxD
- * 3     5     RxD       ->  B.RxD
- * 4           RTS       <- ~A.RTS
- * 5           CTS       -> ~B.CTS
- * 6     6     DSR       -> ~A.SYNC
- * 8           CD        -> ~B.DCD
- * 12          DSRS(DCE) -> ~A.CTS  (*)
- * 15          TxC       ->  B.TxC
- * 17          RxC       ->  B.RxC
- * 20    1     DTR       <- ~A.DTR
- * 22          RI        -> ~A.DCD
- * 23          DSRS(DTE) <- ~B.RTS
- *
- * (*) EIA-232 defines the signal at this pin to be SCD, while DSRS(DCE)
- *     is shared with DSRS(DTE) at pin 23.
- */
-
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/interrupt.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/major.h>
-#include <linux/string.h>
-#include <linux/fcntl.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/spinlock.h>
-#ifdef CONFIG_SERIAL_DEC_CONSOLE
-#include <linux/console.h>
-#endif
-
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/irq.h>
-#include <asm/system.h>
-#include <asm/bootinfo.h>
-
-#include <asm/dec/interrupts.h>
-#include <asm/dec/ioasic_addrs.h>
-#include <asm/dec/machtype.h>
-#include <asm/dec/serial.h>
-#include <asm/dec/system.h>
-
-#ifdef CONFIG_KGDB
-#include <asm/kgdb.h>
-#endif
-#ifdef CONFIG_MAGIC_SYSRQ
-#include <linux/sysrq.h>
-#endif
-
-#include "zs.h"
-
-/*
- * It would be nice to dynamically allocate everything that
- * depends on NUM_SERIAL, so we could support any number of
- * Z8530s, but for now...
- */
-#define NUM_SERIAL	2		/* Max number of ZS chips supported */
-#define NUM_CHANNELS	(NUM_SERIAL * 2)	/* 2 channels per chip */
-#define CHANNEL_A_NR  (zs_parms->channel_a_offset > zs_parms->channel_b_offset)
-                                        /* Number of channel A in the chip */
-#define ZS_CHAN_IO_SIZE 8
-#define ZS_CLOCK        7372800 	/* Z8530 RTxC input clock rate */
-
-#define RECOVERY_DELAY  udelay(2)
-
-struct zs_parms {
-	unsigned long scc0;
-	unsigned long scc1;
-	int channel_a_offset;
-	int channel_b_offset;
-	int irq0;
-	int irq1;
-	int clock;
-};
-
-static struct zs_parms *zs_parms;
-
-#ifdef CONFIG_MACH_DECSTATION
-static struct zs_parms ds_parms = {
-	scc0 : IOASIC_SCC0,
-	scc1 : IOASIC_SCC1,
-	channel_a_offset : 1,
-	channel_b_offset : 9,
-	irq0 : -1,
-	irq1 : -1,
-	clock : ZS_CLOCK
-};
-#endif
-
-#ifdef CONFIG_MACH_DECSTATION
-#define DS_BUS_PRESENT (IOASIC)
-#else
-#define DS_BUS_PRESENT 0
-#endif
-
-#define BUS_PRESENT (DS_BUS_PRESENT)
-
-DEFINE_SPINLOCK(zs_lock);
-
-struct dec_zschannel zs_channels[NUM_CHANNELS];
-struct dec_serial zs_soft[NUM_CHANNELS];
-int zs_channels_found;
-struct dec_serial *zs_chain;	/* list of all channels */
-
-struct tty_struct zs_ttys[NUM_CHANNELS];
-
-#ifdef CONFIG_SERIAL_DEC_CONSOLE
-static struct console zs_console;
-#endif
-#if defined(CONFIG_SERIAL_DEC_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) && \
-   !defined(MODULE)
-static unsigned long break_pressed; /* break, really ... */
-#endif
-
-static unsigned char zs_init_regs[16] __initdata = {
-	0,				/* write 0 */
-	0,				/* write 1 */
-	0,				/* write 2 */
-	0,				/* write 3 */
-	(X16CLK),			/* write 4 */
-	0,				/* write 5 */
-	0, 0, 0,			/* write 6, 7, 8 */
-	(MIE | DLC | NV),		/* write 9 */
-	(NRZ),				/* write 10 */
-	(TCBR | RCBR),			/* write 11 */
-	0, 0,				/* BRG time constant, write 12 + 13 */
-	(BRSRC | BRENABL),		/* write 14 */
-	0				/* write 15 */
-};
-
-static struct tty_driver *serial_driver;
-
-/* serial subtype definitions */
-#define SERIAL_TYPE_NORMAL	1
-
-/* number of characters left in xmit buffer before we ask for more */
-#define WAKEUP_CHARS 256
-
-/*
- * Debugging.
- */
-#undef SERIAL_DEBUG_OPEN
-#undef SERIAL_DEBUG_FLOW
-#undef SERIAL_DEBUG_THROTTLE
-#undef SERIAL_PARANOIA_CHECK
-
-#undef ZS_DEBUG_REGS
-
-#ifdef SERIAL_DEBUG_THROTTLE
-#define _tty_name(tty,buf) tty_name(tty,buf)
-#endif
-
-#define RS_STROBE_TIME 10
-#define RS_ISR_PASS_LIMIT 256
-
-static void probe_sccs(void);
-static void change_speed(struct dec_serial *info);
-static void rs_wait_until_sent(struct tty_struct *tty, int timeout);
-
-static inline int serial_paranoia_check(struct dec_serial *info,
-					char *name, const char *routine)
-{
-#ifdef SERIAL_PARANOIA_CHECK
-	static const char *badmagic =
-		"Warning: bad magic number for serial struct %s in %s\n";
-	static const char *badinfo =
-		"Warning: null mac_serial for %s in %s\n";
-
-	if (!info) {
-		printk(badinfo, name, routine);
-		return 1;
-	}
-	if (info->magic != SERIAL_MAGIC) {
-		printk(badmagic, name, routine);
-		return 1;
-	}
-#endif
-	return 0;
-}
-
-/*
- * This is used to figure out the divisor speeds and the timeouts
- */
-static int baud_table[] = {
-	0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800,
-	9600, 19200, 38400, 57600, 115200, 0 };
-
-/*
- * Reading and writing Z8530 registers.
- */
-static inline unsigned char read_zsreg(struct dec_zschannel *channel,
-				       unsigned char reg)
-{
-	unsigned char retval;
-
-	if (reg != 0) {
-		*channel->control = reg & 0xf;
-		fast_iob(); RECOVERY_DELAY;
-	}
-	retval = *channel->control;
-	RECOVERY_DELAY;
-	return retval;
-}
-
-static inline void write_zsreg(struct dec_zschannel *channel,
-			       unsigned char reg, unsigned char value)
-{
-	if (reg != 0) {
-		*channel->control = reg & 0xf;
-		fast_iob(); RECOVERY_DELAY;
-	}
-	*channel->control = value;
-	fast_iob(); RECOVERY_DELAY;
-	return;
-}
-
-static inline unsigned char read_zsdata(struct dec_zschannel *channel)
-{
-	unsigned char retval;
-
-	retval = *channel->data;
-	RECOVERY_DELAY;
-	return retval;
-}
-
-static inline void write_zsdata(struct dec_zschannel *channel,
-				unsigned char value)
-{
-	*channel->data = value;
-	fast_iob(); RECOVERY_DELAY;
-	return;
-}
-
-static inline void load_zsregs(struct dec_zschannel *channel,
-			       unsigned char *regs)
-{
-/*	ZS_CLEARERR(channel);
-	ZS_CLEARFIFO(channel); */
-	/* Load 'em up */
-	write_zsreg(channel, R3, regs[R3] & ~RxENABLE);
-	write_zsreg(channel, R5, regs[R5] & ~TxENAB);
-	write_zsreg(channel, R4, regs[R4]);
-	write_zsreg(channel, R9, regs[R9]);
-	write_zsreg(channel, R1, regs[R1]);
-	write_zsreg(channel, R2, regs[R2]);
-	write_zsreg(channel, R10, regs[R10]);
-	write_zsreg(channel, R11, regs[R11]);
-	write_zsreg(channel, R12, regs[R12]);
-	write_zsreg(channel, R13, regs[R13]);
-	write_zsreg(channel, R14, regs[R14]);
-	write_zsreg(channel, R15, regs[R15]);
-	write_zsreg(channel, R3, regs[R3]);
-	write_zsreg(channel, R5, regs[R5]);
-	return;
-}
-
-/* Sets or clears DTR/RTS on the requested line */
-static inline void zs_rtsdtr(struct dec_serial *info, int which, int set)
-{
-        unsigned long flags;
-
-	spin_lock_irqsave(&zs_lock, flags);
-	if (info->zs_channel != info->zs_chan_a) {
-		if (set) {
-			info->zs_chan_a->curregs[5] |= (which & (RTS | DTR));
-		} else {
-			info->zs_chan_a->curregs[5] &= ~(which & (RTS | DTR));
-		}
-		write_zsreg(info->zs_chan_a, 5, info->zs_chan_a->curregs[5]);
-	}
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-/* Utility routines for the Zilog */
-static inline int get_zsbaud(struct dec_serial *ss)
-{
-	struct dec_zschannel *channel = ss->zs_channel;
-	int brg;
-
-	/* The baud rate is split up between two 8-bit registers in
-	 * what is termed 'BRG time constant' format in my docs for
-	 * the chip, it is a function of the clk rate the chip is
-	 * receiving which happens to be constant.
-	 */
-	brg = (read_zsreg(channel, 13) << 8);
-	brg |= read_zsreg(channel, 12);
-	return BRG_TO_BPS(brg, (zs_parms->clock/(ss->clk_divisor)));
-}
-
-/* On receive, this clears errors and the receiver interrupts */
-static inline void rs_recv_clear(struct dec_zschannel *zsc)
-{
-	write_zsreg(zsc, 0, ERR_RES);
-	write_zsreg(zsc, 0, RES_H_IUS); /* XXX this is unnecessary */
-}
-
-/*
- * ----------------------------------------------------------------------
- *
- * Here starts the interrupt handling routines.  All of the following
- * subroutines are declared as inline and are folded into
- * rs_interrupt().  They were separated out for readability's sake.
- *
- * 				- Ted Ts'o (tytso@mit.edu), 7-Mar-93
- * -----------------------------------------------------------------------
- */
-
-/*
- * This routine is used by the interrupt handler to schedule
- * processing in the software interrupt portion of the driver.
- */
-static void rs_sched_event(struct dec_serial *info, int event)
-{
-	info->event |= 1 << event;
-	tasklet_schedule(&info->tlet);
-}
-
-static void receive_chars(struct dec_serial *info)
-{
-	struct tty_struct *tty = info->tty;
-	unsigned char ch, stat, flag;
-
-	while ((read_zsreg(info->zs_channel, R0) & Rx_CH_AV) != 0) {
-
-		stat = read_zsreg(info->zs_channel, R1);
-		ch = read_zsdata(info->zs_channel);
-
-		if (!tty && (!info->hook || !info->hook->rx_char))
-			continue;
-
-		flag = TTY_NORMAL;
-		if (info->tty_break) {
-			info->tty_break = 0;
-			flag = TTY_BREAK;
-			if (info->flags & ZILOG_SAK)
-				do_SAK(tty);
-			/* Ignore the null char got when BREAK is removed.  */
-			if (ch == 0)
-				continue;
-		} else {
-			if (stat & Rx_OVR) {
-				flag = TTY_OVERRUN;
-			} else if (stat & FRM_ERR) {
-				flag = TTY_FRAME;
-			} else if (stat & PAR_ERR) {
-				flag = TTY_PARITY;
-			}
-			if (flag != TTY_NORMAL)
-				/* reset the error indication */
-				write_zsreg(info->zs_channel, R0, ERR_RES);
-		}
-
-#if defined(CONFIG_SERIAL_DEC_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) && \
-   !defined(MODULE)
-		if (break_pressed && info->line == zs_console.index) {
-			/* Ignore the null char got when BREAK is removed.  */
-			if (ch == 0)
-				continue;
-			if (time_before(jiffies, break_pressed + HZ * 5)) {
-				handle_sysrq(ch, NULL);
-				break_pressed = 0;
-				continue;
-			}
-			break_pressed = 0;
-		}
-#endif
-
-		if (info->hook && info->hook->rx_char) {
-			(*info->hook->rx_char)(ch, flag);
-			return;
-  		}
-
-		tty_insert_flip_char(tty, ch, flag);
-	}
-	if (tty)
-		tty_flip_buffer_push(tty);
-}
-
-static void transmit_chars(struct dec_serial *info)
-{
-	if ((read_zsreg(info->zs_channel, R0) & Tx_BUF_EMP) == 0)
-		return;
-	info->tx_active = 0;
-
-	if (info->x_char) {
-		/* Send next char */
-		write_zsdata(info->zs_channel, info->x_char);
-		info->x_char = 0;
-		info->tx_active = 1;
-		return;
-	}
-
-	if ((info->xmit_cnt <= 0) || (info->tty && info->tty->stopped)
-	    || info->tx_stopped) {
-		write_zsreg(info->zs_channel, R0, RES_Tx_P);
-		return;
-	}
-	/* Send char */
-	write_zsdata(info->zs_channel, info->xmit_buf[info->xmit_tail++]);
-	info->xmit_tail = info->xmit_tail & (SERIAL_XMIT_SIZE-1);
-	info->xmit_cnt--;
-	info->tx_active = 1;
-
-	if (info->xmit_cnt < WAKEUP_CHARS)
-		rs_sched_event(info, RS_EVENT_WRITE_WAKEUP);
-}
-
-static void status_handle(struct dec_serial *info)
-{
-	unsigned char stat;
-
-	/* Get status from Read Register 0 */
-	stat = read_zsreg(info->zs_channel, R0);
-
-	if ((stat & BRK_ABRT) && !(info->read_reg_zero & BRK_ABRT)) {
-#if defined(CONFIG_SERIAL_DEC_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) && \
-   !defined(MODULE)
-		if (info->line == zs_console.index) {
-			if (!break_pressed)
-				break_pressed = jiffies;
-		} else
-#endif
-			info->tty_break = 1;
-	}
-
-	if (info->zs_channel != info->zs_chan_a) {
-
-		/* Check for DCD transitions */
-		if (info->tty && !C_CLOCAL(info->tty) &&
-		    ((stat ^ info->read_reg_zero) & DCD) != 0 ) {
-			if (stat & DCD) {
-				wake_up_interruptible(&info->open_wait);
-			} else {
-				tty_hangup(info->tty);
-			}
-		}
-
-		/* Check for CTS transitions */
-		if (info->tty && C_CRTSCTS(info->tty)) {
-			if ((stat & CTS) != 0) {
-				if (info->tx_stopped) {
-					info->tx_stopped = 0;
-					if (!info->tx_active)
-						transmit_chars(info);
-				}
-			} else {
-				info->tx_stopped = 1;
-			}
-		}
-
-	}
-
-	/* Clear status condition... */
-	write_zsreg(info->zs_channel, R0, RES_EXT_INT);
-	info->read_reg_zero = stat;
-}
-
-/*
- * This is the serial driver's generic interrupt routine
- */
-static irqreturn_t rs_interrupt(int irq, void *dev_id)
-{
-	struct dec_serial *info = (struct dec_serial *) dev_id;
-	irqreturn_t status = IRQ_NONE;
-	unsigned char zs_intreg;
-	int shift;
-
-	/* NOTE: The read register 3, which holds the irq status,
-	 *       does so for both channels on each chip.  Although
-	 *       the status value itself must be read from the A
-	 *       channel and is only valid when read from channel A.
-	 *       Yes... broken hardware...
-	 */
-#define CHAN_IRQMASK (CHBRxIP | CHBTxIP | CHBEXT)
-
-	if (info->zs_chan_a == info->zs_channel)
-		shift = 3;	/* Channel A */
-	else
-		shift = 0;	/* Channel B */
-
-	for (;;) {
-		zs_intreg = read_zsreg(info->zs_chan_a, R3) >> shift;
-		if ((zs_intreg & CHAN_IRQMASK) == 0)
-			break;
-
-		status = IRQ_HANDLED;
-
-		if (zs_intreg & CHBRxIP) {
-			receive_chars(info);
-		}
-		if (zs_intreg & CHBTxIP) {
-			transmit_chars(info);
-		}
-		if (zs_intreg & CHBEXT) {
-			status_handle(info);
-		}
-	}
-
-	/* Why do we need this ? */
-	write_zsreg(info->zs_channel, 0, RES_H_IUS);
-
-	return status;
-}
-
-#ifdef ZS_DEBUG_REGS
-void zs_dump (void) {
-	int i, j;
-	for (i = 0; i < zs_channels_found; i++) {
-		struct dec_zschannel *ch = &zs_channels[i];
-		if ((long)ch->control == UNI_IO_BASE+UNI_SCC1A_CTRL) {
-			for (j = 0; j < 15; j++) {
-				printk("W%d = 0x%x\t",
-				       j, (int)ch->curregs[j]);
-			}
-			for (j = 0; j < 15; j++) {
-				printk("R%d = 0x%x\t",
-				       j, (int)read_zsreg(ch,j));
-			}
-			printk("\n\n");
-		}
-	}
-}
-#endif
-
-/*
- * -------------------------------------------------------------------
- * Here ends the serial interrupt routines.
- * -------------------------------------------------------------------
- */
-
-/*
- * ------------------------------------------------------------
- * rs_stop() and rs_start()
- *
- * This routines are called before setting or resetting tty->stopped.
- * ------------------------------------------------------------
- */
-static void rs_stop(struct tty_struct *tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	unsigned long flags;
-
-	if (serial_paranoia_check(info, tty->name, "rs_stop"))
-		return;
-
-#if 1
-	spin_lock_irqsave(&zs_lock, flags);
-	if (info->zs_channel->curregs[5] & TxENAB) {
-		info->zs_channel->curregs[5] &= ~TxENAB;
-		write_zsreg(info->zs_channel, 5, info->zs_channel->curregs[5]);
-	}
-	spin_unlock_irqrestore(&zs_lock, flags);
-#endif
-}
-
-static void rs_start(struct tty_struct *tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	unsigned long flags;
-
-	if (serial_paranoia_check(info, tty->name, "rs_start"))
-		return;
-
-	spin_lock_irqsave(&zs_lock, flags);
-#if 1
-	if (info->xmit_cnt && info->xmit_buf && !(info->zs_channel->curregs[5] & TxENAB)) {
-		info->zs_channel->curregs[5] |= TxENAB;
-		write_zsreg(info->zs_channel, 5, info->zs_channel->curregs[5]);
-	}
-#else
-	if (info->xmit_cnt && info->xmit_buf && !info->tx_active) {
-		transmit_chars(info);
-	}
-#endif
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-/*
- * This routine is used to handle the "bottom half" processing for the
- * serial driver, known also the "software interrupt" processing.
- * This processing is done at the kernel interrupt level, after the
- * rs_interrupt() has returned, BUT WITH INTERRUPTS TURNED ON.  This
- * is where time-consuming activities which can not be done in the
- * interrupt driver proper are done; the interrupt driver schedules
- * them using rs_sched_event(), and they get done here.
- */
-
-static void do_softint(unsigned long private_)
-{
-	struct dec_serial	*info = (struct dec_serial *) private_;
-	struct tty_struct	*tty;
-
-	tty = info->tty;
-	if (!tty)
-		return;
-
-	if (test_and_clear_bit(RS_EVENT_WRITE_WAKEUP, &info->event))
-		tty_wakeup(tty);
-}
-
-static int zs_startup(struct dec_serial * info)
-{
-	unsigned long flags;
-
-	if (info->flags & ZILOG_INITIALIZED)
-		return 0;
-
-	if (!info->xmit_buf) {
-		info->xmit_buf = (unsigned char *) get_zeroed_page(GFP_KERNEL);
-		if (!info->xmit_buf)
-			return -ENOMEM;
-	}
-
-	spin_lock_irqsave(&zs_lock, flags);
-
-#ifdef SERIAL_DEBUG_OPEN
-	printk("starting up ttyS%d (irq %d)...", info->line, info->irq);
-#endif
-
-	/*
-	 * Clear the receive FIFO.
-	 */
-	ZS_CLEARFIFO(info->zs_channel);
-	info->xmit_fifo_size = 1;
-
-	/*
-	 * Clear the interrupt registers.
-	 */
-	write_zsreg(info->zs_channel, R0, ERR_RES);
-	write_zsreg(info->zs_channel, R0, RES_H_IUS);
-
-	/*
-	 * Set the speed of the serial port
-	 */
-	change_speed(info);
-
-	/*
-	 * Turn on RTS and DTR.
-	 */
-	zs_rtsdtr(info, RTS | DTR, 1);
-
-	/*
-	 * Finally, enable sequencing and interrupts
-	 */
-	info->zs_channel->curregs[R1] &= ~RxINT_MASK;
-	info->zs_channel->curregs[R1] |= (RxINT_ALL | TxINT_ENAB |
-					  EXT_INT_ENAB);
-	info->zs_channel->curregs[R3] |= RxENABLE;
-	info->zs_channel->curregs[R5] |= TxENAB;
-	info->zs_channel->curregs[R15] |= (DCDIE | CTSIE | TxUIE | BRKIE);
-	write_zsreg(info->zs_channel, R1, info->zs_channel->curregs[R1]);
-	write_zsreg(info->zs_channel, R3, info->zs_channel->curregs[R3]);
-	write_zsreg(info->zs_channel, R5, info->zs_channel->curregs[R5]);
-	write_zsreg(info->zs_channel, R15, info->zs_channel->curregs[R15]);
-
-	/*
-	 * And clear the interrupt registers again for luck.
-	 */
-	write_zsreg(info->zs_channel, R0, ERR_RES);
-	write_zsreg(info->zs_channel, R0, RES_H_IUS);
-
-	/* Save the current value of RR0 */
-	info->read_reg_zero = read_zsreg(info->zs_channel, R0);
-
-	if (info->tty)
-		clear_bit(TTY_IO_ERROR, &info->tty->flags);
-	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
-
-	info->flags |= ZILOG_INITIALIZED;
-	spin_unlock_irqrestore(&zs_lock, flags);
-	return 0;
-}
-
-/*
- * This routine will shutdown a serial port; interrupts are disabled, and
- * DTR is dropped if the hangup on close termio flag is on.
- */
-static void shutdown(struct dec_serial * info)
-{
-	unsigned long	flags;
-
-	if (!(info->flags & ZILOG_INITIALIZED))
-		return;
-
-#ifdef SERIAL_DEBUG_OPEN
-	printk("Shutting down serial port %d (irq %d)....", info->line,
-	       info->irq);
-#endif
-
-	spin_lock_irqsave(&zs_lock, flags);
-
-	if (info->xmit_buf) {
-		free_page((unsigned long) info->xmit_buf);
-		info->xmit_buf = 0;
-	}
-
-	info->zs_channel->curregs[1] = 0;
-	write_zsreg(info->zs_channel, 1, info->zs_channel->curregs[1]);	/* no interrupts */
-
-	info->zs_channel->curregs[3] &= ~RxENABLE;
-	write_zsreg(info->zs_channel, 3, info->zs_channel->curregs[3]);
-
-	info->zs_channel->curregs[5] &= ~TxENAB;
-	write_zsreg(info->zs_channel, 5, info->zs_channel->curregs[5]);
-	if (!info->tty || C_HUPCL(info->tty)) {
-		zs_rtsdtr(info, RTS | DTR, 0);
-	}
-
-	if (info->tty)
-		set_bit(TTY_IO_ERROR, &info->tty->flags);
-
-	info->flags &= ~ZILOG_INITIALIZED;
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-/*
- * This routine is called to set the UART divisor registers to match
- * the specified baud rate for a serial port.
- */
-static void change_speed(struct dec_serial *info)
-{
-	unsigned cflag;
-	int	i;
-	int	brg, bits;
-	unsigned long flags;
-
-	if (!info->hook) {
-		if (!info->tty || !info->tty->termios)
-			return;
-		cflag = info->tty->termios->c_cflag;
-		if (!info->port)
-			return;
-	} else {
-		cflag = info->hook->cflags;
-	}
-
-	i = cflag & CBAUD;
-	if (i & CBAUDEX) {
-		i &= ~CBAUDEX;
-		if (i < 1 || i > 2) {
-			if (!info->hook)
-				info->tty->termios->c_cflag &= ~CBAUDEX;
-			else
-				info->hook->cflags &= ~CBAUDEX;
-		} else
-			i += 15;
-	}
-
-	spin_lock_irqsave(&zs_lock, flags);
-	info->zs_baud = baud_table[i];
-	if (info->zs_baud) {
-		brg = BPS_TO_BRG(info->zs_baud, zs_parms->clock/info->clk_divisor);
-		info->zs_channel->curregs[12] = (brg & 255);
-		info->zs_channel->curregs[13] = ((brg >> 8) & 255);
-		zs_rtsdtr(info, DTR, 1);
-	} else {
-		zs_rtsdtr(info, RTS | DTR, 0);
-		return;
-	}
-
-	/* byte size and parity */
-	info->zs_channel->curregs[3] &= ~RxNBITS_MASK;
-	info->zs_channel->curregs[5] &= ~TxNBITS_MASK;
-	switch (cflag & CSIZE) {
-	case CS5:
-		bits = 7;
-		info->zs_channel->curregs[3] |= Rx5;
-		info->zs_channel->curregs[5] |= Tx5;
-		break;
-	case CS6:
-		bits = 8;
-		info->zs_channel->curregs[3] |= Rx6;
-		info->zs_channel->curregs[5] |= Tx6;
-		break;
-	case CS7:
-		bits = 9;
-		info->zs_channel->curregs[3] |= Rx7;
-		info->zs_channel->curregs[5] |= Tx7;
-		break;
-	case CS8:
-	default: /* defaults to 8 bits */
-		bits = 10;
-		info->zs_channel->curregs[3] |= Rx8;
-		info->zs_channel->curregs[5] |= Tx8;
-		break;
-	}
-
-	info->timeout = ((info->xmit_fifo_size*HZ*bits) / info->zs_baud);
-        info->timeout += HZ/50;         /* Add .02 seconds of slop */
-
-	info->zs_channel->curregs[4] &= ~(SB_MASK | PAR_ENA | PAR_EVEN);
-	if (cflag & CSTOPB) {
-		info->zs_channel->curregs[4] |= SB2;
-	} else {
-		info->zs_channel->curregs[4] |= SB1;
-	}
-	if (cflag & PARENB) {
-		info->zs_channel->curregs[4] |= PAR_ENA;
-	}
-	if (!(cflag & PARODD)) {
-		info->zs_channel->curregs[4] |= PAR_EVEN;
-	}
-
-	if (!(cflag & CLOCAL)) {
-		if (!(info->zs_channel->curregs[15] & DCDIE))
-			info->read_reg_zero = read_zsreg(info->zs_channel, 0);
-		info->zs_channel->curregs[15] |= DCDIE;
-	} else
-		info->zs_channel->curregs[15] &= ~DCDIE;
-	if (cflag & CRTSCTS) {
-		info->zs_channel->curregs[15] |= CTSIE;
-		if ((read_zsreg(info->zs_channel, 0) & CTS) == 0)
-			info->tx_stopped = 1;
-	} else {
-		info->zs_channel->curregs[15] &= ~CTSIE;
-		info->tx_stopped = 0;
-	}
-
-	/* Load up the new values */
-	load_zsregs(info->zs_channel, info->zs_channel->curregs);
-
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-static void rs_flush_chars(struct tty_struct *tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	unsigned long flags;
-
-	if (serial_paranoia_check(info, tty->name, "rs_flush_chars"))
-		return;
-
-	if (info->xmit_cnt <= 0 || tty->stopped || info->tx_stopped ||
-	    !info->xmit_buf)
-		return;
-
-	/* Enable transmitter */
-	spin_lock_irqsave(&zs_lock, flags);
-	transmit_chars(info);
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-static int rs_write(struct tty_struct * tty,
-		    const unsigned char *buf, int count)
-{
-	int	c, total = 0;
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	unsigned long flags;
-
-	if (serial_paranoia_check(info, tty->name, "rs_write"))
-		return 0;
-
-	if (!tty || !info->xmit_buf)
-		return 0;
-
-	while (1) {
-		spin_lock_irqsave(&zs_lock, flags);
-		c = min(count, min(SERIAL_XMIT_SIZE - info->xmit_cnt - 1,
-				   SERIAL_XMIT_SIZE - info->xmit_head));
-		if (c <= 0)
-			break;
-
-		memcpy(info->xmit_buf + info->xmit_head, buf, c);
-		info->xmit_head = (info->xmit_head + c) & (SERIAL_XMIT_SIZE-1);
-		info->xmit_cnt += c;
-		spin_unlock_irqrestore(&zs_lock, flags);
-		buf += c;
-		count -= c;
-		total += c;
-	}
-
-	if (info->xmit_cnt && !tty->stopped && !info->tx_stopped
-	    && !info->tx_active)
-		transmit_chars(info);
-	spin_unlock_irqrestore(&zs_lock, flags);
-	return total;
-}
-
-static int rs_write_room(struct tty_struct *tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	int	ret;
-
-	if (serial_paranoia_check(info, tty->name, "rs_write_room"))
-		return 0;
-	ret = SERIAL_XMIT_SIZE - info->xmit_cnt - 1;
-	if (ret < 0)
-		ret = 0;
-	return ret;
-}
-
-static int rs_chars_in_buffer(struct tty_struct *tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-
-	if (serial_paranoia_check(info, tty->name, "rs_chars_in_buffer"))
-		return 0;
-	return info->xmit_cnt;
-}
-
-static void rs_flush_buffer(struct tty_struct *tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-
-	if (serial_paranoia_check(info, tty->name, "rs_flush_buffer"))
-		return;
-	spin_lock_irq(&zs_lock);
-	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
-	spin_unlock_irq(&zs_lock);
-	tty_wakeup(tty);
-}
-
-/*
- * ------------------------------------------------------------
- * rs_throttle()
- *
- * This routine is called by the upper-layer tty layer to signal that
- * incoming characters should be throttled.
- * ------------------------------------------------------------
- */
-static void rs_throttle(struct tty_struct * tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	unsigned long flags;
-
-#ifdef SERIAL_DEBUG_THROTTLE
-	char	buf[64];
-
-	printk("throttle %s: %d....\n", _tty_name(tty, buf),
-	       tty->ldisc.chars_in_buffer(tty));
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "rs_throttle"))
-		return;
-
-	if (I_IXOFF(tty)) {
-		spin_lock_irqsave(&zs_lock, flags);
-		info->x_char = STOP_CHAR(tty);
-		if (!info->tx_active)
-			transmit_chars(info);
-		spin_unlock_irqrestore(&zs_lock, flags);
-	}
-
-	if (C_CRTSCTS(tty)) {
-		zs_rtsdtr(info, RTS, 0);
-	}
-}
-
-static void rs_unthrottle(struct tty_struct * tty)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	unsigned long flags;
-
-#ifdef SERIAL_DEBUG_THROTTLE
-	char	buf[64];
-
-	printk("unthrottle %s: %d....\n", _tty_name(tty, buf),
-	       tty->ldisc.chars_in_buffer(tty));
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "rs_unthrottle"))
-		return;
-
-	if (I_IXOFF(tty)) {
-		spin_lock_irqsave(&zs_lock, flags);
-		if (info->x_char)
-			info->x_char = 0;
-		else {
-			info->x_char = START_CHAR(tty);
-			if (!info->tx_active)
-				transmit_chars(info);
-		}
-		spin_unlock_irqrestore(&zs_lock, flags);
-	}
-
-	if (C_CRTSCTS(tty)) {
-		zs_rtsdtr(info, RTS, 1);
-	}
-}
-
-/*
- * ------------------------------------------------------------
- * rs_ioctl() and friends
- * ------------------------------------------------------------
- */
-
-static int get_serial_info(struct dec_serial * info,
-			   struct serial_struct * retinfo)
-{
-	struct serial_struct tmp;
-
-	if (!retinfo)
-		return -EFAULT;
-	memset(&tmp, 0, sizeof(tmp));
-	tmp.type = info->type;
-	tmp.line = info->line;
-	tmp.port = info->port;
-	tmp.irq = info->irq;
-	tmp.flags = info->flags;
-	tmp.baud_base = info->baud_base;
-	tmp.close_delay = info->close_delay;
-	tmp.closing_wait = info->closing_wait;
-	tmp.custom_divisor = info->custom_divisor;
-	return copy_to_user(retinfo,&tmp,sizeof(*retinfo)) ? -EFAULT : 0;
-}
-
-static int set_serial_info(struct dec_serial * info,
-			   struct serial_struct * new_info)
-{
-	struct serial_struct new_serial;
-	struct dec_serial old_info;
-	int 			retval = 0;
-
-	if (!new_info)
-		return -EFAULT;
-	copy_from_user(&new_serial,new_info,sizeof(new_serial));
-	old_info = *info;
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		if ((new_serial.baud_base != info->baud_base) ||
-		    (new_serial.type != info->type) ||
-		    (new_serial.close_delay != info->close_delay) ||
-		    ((new_serial.flags & ~ZILOG_USR_MASK) !=
-		     (info->flags & ~ZILOG_USR_MASK)))
-			return -EPERM;
-		info->flags = ((info->flags & ~ZILOG_USR_MASK) |
-			       (new_serial.flags & ZILOG_USR_MASK));
-		info->custom_divisor = new_serial.custom_divisor;
-		goto check_and_exit;
-	}
-
-	if (info->count > 1)
-		return -EBUSY;
-
-	/*
-	 * OK, past this point, all the error checking has been done.
-	 * At this point, we start making changes.....
-	 */
-
-	info->baud_base = new_serial.baud_base;
-	info->flags = ((info->flags & ~ZILOG_FLAGS) |
-			(new_serial.flags & ZILOG_FLAGS));
-	info->type = new_serial.type;
-	info->close_delay = new_serial.close_delay;
-	info->closing_wait = new_serial.closing_wait;
-
-check_and_exit:
-	retval = zs_startup(info);
-	return retval;
-}
-
-/*
- * get_lsr_info - get line status register info
- *
- * Purpose: Let user call ioctl() to get info when the UART physically
- * 	    is emptied.  On bus types like RS485, the transmitter must
- * 	    release the bus after transmitting. This must be done when
- * 	    the transmit shift register is empty, not be done when the
- * 	    transmit holding register is empty.  This functionality
- * 	    allows an RS485 driver to be written in user space.
- */
-static int get_lsr_info(struct dec_serial * info, unsigned int *value)
-{
-	unsigned char status;
-
-	spin_lock(&zs_lock);
-	status = read_zsreg(info->zs_channel, 0);
-	spin_unlock_irq(&zs_lock);
-	put_user(status,value);
-	return 0;
-}
-
-static int rs_tiocmget(struct tty_struct *tty, struct file *file)
-{
-	struct dec_serial * info = (struct dec_serial *)tty->driver_data;
-	unsigned char control, status_a, status_b;
-	unsigned int result;
-
-	if (info->hook)
-		return -ENODEV;
-
-	if (serial_paranoia_check(info, tty->name, __FUNCTION__))
-		return -ENODEV;
-
-	if (tty->flags & (1 << TTY_IO_ERROR))
-		return -EIO;
-
-	if (info->zs_channel == info->zs_chan_a)
-		result = 0;
-	else {
-		spin_lock(&zs_lock);
-		control = info->zs_chan_a->curregs[5];
-		status_a = read_zsreg(info->zs_chan_a, 0);
-		status_b = read_zsreg(info->zs_channel, 0);
-		spin_unlock_irq(&zs_lock);
-		result =  ((control  & RTS) ? TIOCM_RTS: 0)
-			| ((control  & DTR) ? TIOCM_DTR: 0)
-			| ((status_b & DCD) ? TIOCM_CAR: 0)
-			| ((status_a & DCD) ? TIOCM_RNG: 0)
-			| ((status_a & SYNC_HUNT) ? TIOCM_DSR: 0)
-			| ((status_b & CTS) ? TIOCM_CTS: 0);
-	}
-	return result;
-}
-
-static int rs_tiocmset(struct tty_struct *tty, struct file *file,
-                       unsigned int set, unsigned int clear)
-{
-	struct dec_serial * info = (struct dec_serial *)tty->driver_data;
-
-	if (info->hook)
-		return -ENODEV;
-
-	if (serial_paranoia_check(info, tty->name, __FUNCTION__))
-		return -ENODEV;
-
-	if (tty->flags & (1 << TTY_IO_ERROR))
-		return -EIO;
-
-	if (info->zs_channel == info->zs_chan_a)
-		return 0;
-
-	spin_lock(&zs_lock);
-	if (set & TIOCM_RTS)
-		info->zs_chan_a->curregs[5] |= RTS;
-	if (set & TIOCM_DTR)
-		info->zs_chan_a->curregs[5] |= DTR;
-	if (clear & TIOCM_RTS)
-		info->zs_chan_a->curregs[5] &= ~RTS;
-	if (clear & TIOCM_DTR)
-		info->zs_chan_a->curregs[5] &= ~DTR;
-	write_zsreg(info->zs_chan_a, 5, info->zs_chan_a->curregs[5]);
-	spin_unlock_irq(&zs_lock);
-	return 0;
-}
-
-/*
- * rs_break - turn transmit break condition on/off
- */
-static void rs_break(struct tty_struct *tty, int break_state)
-{
-	struct dec_serial *info = (struct dec_serial *) tty->driver_data;
-	unsigned long flags;
-
-	if (serial_paranoia_check(info, tty->name, "rs_break"))
-		return;
-	if (!info->port)
-		return;
-
-	spin_lock_irqsave(&zs_lock, flags);
-	if (break_state == -1)
-		info->zs_channel->curregs[5] |= SND_BRK;
-	else
-		info->zs_channel->curregs[5] &= ~SND_BRK;
-	write_zsreg(info->zs_channel, 5, info->zs_channel->curregs[5]);
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-static int rs_ioctl(struct tty_struct *tty, struct file * file,
-		    unsigned int cmd, unsigned long arg)
-{
-	struct dec_serial * info = (struct dec_serial *)tty->driver_data;
-
-	if (info->hook)
-		return -ENODEV;
-
-	if (serial_paranoia_check(info, tty->name, "rs_ioctl"))
-		return -ENODEV;
-
-	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
-	    (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGWILD)  &&
-	    (cmd != TIOCSERSWILD) && (cmd != TIOCSERGSTRUCT)) {
-		if (tty->flags & (1 << TTY_IO_ERROR))
-		    return -EIO;
-	}
-
-	switch (cmd) {
-	case TIOCGSERIAL:
-		if (!access_ok(VERIFY_WRITE, (void *)arg,
-			       sizeof(struct serial_struct)))
-			return -EFAULT;
-		return get_serial_info(info, (struct serial_struct *)arg);
-
-	case TIOCSSERIAL:
-		return set_serial_info(info, (struct serial_struct *)arg);
-
-	case TIOCSERGETLSR:			/* Get line status register */
-		if (!access_ok(VERIFY_WRITE, (void *)arg,
-			       sizeof(unsigned int)))
-			return -EFAULT;
-		return get_lsr_info(info, (unsigned int *)arg);
-
-	case TIOCSERGSTRUCT:
-		if (!access_ok(VERIFY_WRITE, (void *)arg,
-			       sizeof(struct dec_serial)))
-			return -EFAULT;
-		copy_from_user((struct dec_serial *)arg, info,
-			       sizeof(struct dec_serial));
-		return 0;
-
-	default:
-		return -ENOIOCTLCMD;
-	}
-	return 0;
-}
-
-static void rs_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
-{
-	struct dec_serial *info = (struct dec_serial *)tty->driver_data;
-	int was_stopped;
-
-	if (tty->termios->c_cflag == old_termios->c_cflag)
-		return;
-	was_stopped = info->tx_stopped;
-
-	change_speed(info);
-
-	if (was_stopped && !info->tx_stopped)
-		rs_start(tty);
-}
-
-/*
- * ------------------------------------------------------------
- * rs_close()
- *
- * This routine is called when the serial port gets closed.
- * Wait for the last remaining data to be sent.
- * ------------------------------------------------------------
- */
-static void rs_close(struct tty_struct *tty, struct file * filp)
-{
-	struct dec_serial * info = (struct dec_serial *)tty->driver_data;
-	unsigned long flags;
-
-	if (!info || serial_paranoia_check(info, tty->name, "rs_close"))
-		return;
-
-	spin_lock_irqsave(&zs_lock, flags);
-
-	if (tty_hung_up_p(filp)) {
-		spin_unlock_irqrestore(&zs_lock, flags);
-		return;
-	}
-
-#ifdef SERIAL_DEBUG_OPEN
-	printk("rs_close ttyS%d, count = %d\n", info->line, info->count);
-#endif
-	if ((tty->count == 1) && (info->count != 1)) {
-		/*
-		 * Uh, oh.  tty->count is 1, which means that the tty
-		 * structure will be freed.  Info->count should always
-		 * be one in these conditions.  If it's greater than
-		 * one, we've got real problems, since it means the
-		 * serial port won't be shutdown.
-		 */
-		printk("rs_close: bad serial port count; tty->count is 1, "
-		       "info->count is %d\n", info->count);
-		info->count = 1;
-	}
-	if (--info->count < 0) {
-		printk("rs_close: bad serial port count for ttyS%d: %d\n",
-		       info->line, info->count);
-		info->count = 0;
-	}
-	if (info->count) {
-		spin_unlock_irqrestore(&zs_lock, flags);
-		return;
-	}
-	info->flags |= ZILOG_CLOSING;
-	/*
-	 * Now we wait for the transmit buffer to clear; and we notify
-	 * the line discipline to only process XON/XOFF characters.
-	 */
-	tty->closing = 1;
-	if (info->closing_wait != ZILOG_CLOSING_WAIT_NONE)
-		tty_wait_until_sent(tty, info->closing_wait);
-	/*
-	 * At this point we stop accepting input.  To do this, we
-	 * disable the receiver and receive interrupts.
-	 */
-	info->zs_channel->curregs[3] &= ~RxENABLE;
-	write_zsreg(info->zs_channel, 3, info->zs_channel->curregs[3]);
-	info->zs_channel->curregs[1] = 0;	/* disable any rx ints */
-	write_zsreg(info->zs_channel, 1, info->zs_channel->curregs[1]);
-	ZS_CLEARFIFO(info->zs_channel);
-	if (info->flags & ZILOG_INITIALIZED) {
-		/*
-		 * Before we drop DTR, make sure the SCC transmitter
-		 * has completely drained.
-		 */
-		rs_wait_until_sent(tty, info->timeout);
-	}
-
-	shutdown(info);
-	if (tty->driver->flush_buffer)
-		tty->driver->flush_buffer(tty);
-	tty_ldisc_flush(tty);
-	tty->closing = 0;
-	info->event = 0;
-	info->tty = 0;
-	if (info->blocked_open) {
-		if (info->close_delay) {
-			msleep_interruptible(jiffies_to_msecs(info->close_delay));
-		}
-		wake_up_interruptible(&info->open_wait);
-	}
-	info->flags &= ~(ZILOG_NORMAL_ACTIVE|ZILOG_CLOSING);
-	wake_up_interruptible(&info->close_wait);
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-/*
- * rs_wait_until_sent() --- wait until the transmitter is empty
- */
-static void rs_wait_until_sent(struct tty_struct *tty, int timeout)
-{
-	struct dec_serial *info = (struct dec_serial *) tty->driver_data;
-	unsigned long orig_jiffies;
-	int char_time;
-
-	if (serial_paranoia_check(info, tty->name, "rs_wait_until_sent"))
-		return;
-
-	orig_jiffies = jiffies;
-	/*
-	 * Set the check interval to be 1/5 of the estimated time to
-	 * send a single character, and make it at least 1.  The check
-	 * interval should also be less than the timeout.
-	 */
-	char_time = (info->timeout - HZ/50) / info->xmit_fifo_size;
-	char_time = char_time / 5;
-	if (char_time == 0)
-		char_time = 1;
-	if (timeout)
-		char_time = min(char_time, timeout);
-	while ((read_zsreg(info->zs_channel, 1) & Tx_BUF_EMP) == 0) {
-		msleep_interruptible(jiffies_to_msecs(char_time));
-		if (signal_pending(current))
-			break;
-		if (timeout && time_after(jiffies, orig_jiffies + timeout))
-			break;
-	}
-	current->state = TASK_RUNNING;
-}
-
-/*
- * rs_hangup() --- called by tty_hangup() when a hangup is signaled.
- */
-static void rs_hangup(struct tty_struct *tty)
-{
-	struct dec_serial * info = (struct dec_serial *)tty->driver_data;
-
-	if (serial_paranoia_check(info, tty->name, "rs_hangup"))
-		return;
-
-	rs_flush_buffer(tty);
-	shutdown(info);
-	info->event = 0;
-	info->count = 0;
-	info->flags &= ~ZILOG_NORMAL_ACTIVE;
-	info->tty = 0;
-	wake_up_interruptible(&info->open_wait);
-}
-
-/*
- * ------------------------------------------------------------
- * rs_open() and friends
- * ------------------------------------------------------------
- */
-static int block_til_ready(struct tty_struct *tty, struct file * filp,
-			   struct dec_serial *info)
-{
-	DECLARE_WAITQUEUE(wait, current);
-	int		retval;
-	int		do_clocal = 0;
-
-	/*
-	 * If the device is in the middle of being closed, then block
-	 * until it's done, and then try again.
-	 */
-	if (info->flags & ZILOG_CLOSING) {
-		interruptible_sleep_on(&info->close_wait);
-#ifdef SERIAL_DO_RESTART
-		return ((info->flags & ZILOG_HUP_NOTIFY) ?
-			-EAGAIN : -ERESTARTSYS);
-#else
-		return -EAGAIN;
-#endif
-	}
-
-	/*
-	 * If non-blocking mode is set, or the port is not enabled,
-	 * then make the check up front and then exit.
-	 */
-	if ((filp->f_flags & O_NONBLOCK) ||
-	    (tty->flags & (1 << TTY_IO_ERROR))) {
-		info->flags |= ZILOG_NORMAL_ACTIVE;
-		return 0;
-	}
-
-	if (tty->termios->c_cflag & CLOCAL)
-		do_clocal = 1;
-
-	/*
-	 * Block waiting for the carrier detect and the line to become
-	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, info->count is dropped by one, so that
-	 * rs_close() knows when to free things.  We restore it upon
-	 * exit, either normal or abnormal.
-	 */
-	retval = 0;
-	add_wait_queue(&info->open_wait, &wait);
-#ifdef SERIAL_DEBUG_OPEN
-	printk("block_til_ready before block: ttyS%d, count = %d\n",
-	       info->line, info->count);
-#endif
-	spin_lock(&zs_lock);
-	if (!tty_hung_up_p(filp))
-		info->count--;
-	spin_unlock_irq(&zs_lock);
-	info->blocked_open++;
-	while (1) {
-		spin_lock(&zs_lock);
-		if (tty->termios->c_cflag & CBAUD)
-			zs_rtsdtr(info, RTS | DTR, 1);
-		spin_unlock_irq(&zs_lock);
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (tty_hung_up_p(filp) ||
-		    !(info->flags & ZILOG_INITIALIZED)) {
-#ifdef SERIAL_DO_RESTART
-			if (info->flags & ZILOG_HUP_NOTIFY)
-				retval = -EAGAIN;
-			else
-				retval = -ERESTARTSYS;
-#else
-			retval = -EAGAIN;
-#endif
-			break;
-		}
-		if (!(info->flags & ZILOG_CLOSING) &&
-		    (do_clocal || (read_zsreg(info->zs_channel, 0) & DCD)))
-			break;
-		if (signal_pending(current)) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-#ifdef SERIAL_DEBUG_OPEN
-		printk("block_til_ready blocking: ttyS%d, count = %d\n",
-		       info->line, info->count);
-#endif
-		schedule();
-	}
-	current->state = TASK_RUNNING;
-	remove_wait_queue(&info->open_wait, &wait);
-	if (!tty_hung_up_p(filp))
-		info->count++;
-	info->blocked_open--;
-#ifdef SERIAL_DEBUG_OPEN
-	printk("block_til_ready after blocking: ttyS%d, count = %d\n",
-	       info->line, info->count);
-#endif
-	if (retval)
-		return retval;
-	info->flags |= ZILOG_NORMAL_ACTIVE;
-	return 0;
-}
-
-/*
- * This routine is called whenever a serial port is opened.  It
- * enables interrupts for a serial port, linking in its ZILOG structure into
- * the IRQ chain.   It also performs the serial-specific
- * initialization for the tty structure.
- */
-static int rs_open(struct tty_struct *tty, struct file * filp)
-{
-	struct dec_serial	*info;
-	int 			retval, line;
-
-	line = tty->index;
-	if ((line < 0) || (line >= zs_channels_found))
-		return -ENODEV;
-	info = zs_soft + line;
-
-	if (info->hook)
-		return -ENODEV;
-
-	if (serial_paranoia_check(info, tty->name, "rs_open"))
-		return -ENODEV;
-#ifdef SERIAL_DEBUG_OPEN
-	printk("rs_open %s, count = %d\n", tty->name, info->count);
-#endif
-
-	info->count++;
-	tty->driver_data = info;
-	info->tty = tty;
-
-	/*
-	 * If the port is the middle of closing, bail out now
-	 */
-	if (tty_hung_up_p(filp) ||
-	    (info->flags & ZILOG_CLOSING)) {
-		if (info->flags & ZILOG_CLOSING)
-			interruptible_sleep_on(&info->close_wait);
-#ifdef SERIAL_DO_RESTART
-		return ((info->flags & ZILOG_HUP_NOTIFY) ?
-			-EAGAIN : -ERESTARTSYS);
-#else
-		return -EAGAIN;
-#endif
-	}
-
-	/*
-	 * Start up serial port
-	 */
-	retval = zs_startup(info);
-	if (retval)
-		return retval;
-
-	retval = block_til_ready(tty, filp, info);
-	if (retval) {
-#ifdef SERIAL_DEBUG_OPEN
-		printk("rs_open returning after block_til_ready with %d\n",
-		       retval);
-#endif
-		return retval;
-	}
-
-#ifdef CONFIG_SERIAL_DEC_CONSOLE
-	if (zs_console.cflag && zs_console.index == line) {
-		tty->termios->c_cflag = zs_console.cflag;
-		zs_console.cflag = 0;
-		change_speed(info);
-	}
-#endif
-
-#ifdef SERIAL_DEBUG_OPEN
-	printk("rs_open %s successful...", tty->name);
-#endif
-/* tty->low_latency = 1; */
-	return 0;
-}
-
-/* Finally, routines used to initialize the serial driver. */
-
-static void __init show_serial_version(void)
-{
-	printk("DECstation Z8530 serial driver version 0.09\n");
-}
-
-/*  Initialize Z8530s zs_channels
- */
-
-static void __init probe_sccs(void)
-{
-	struct dec_serial **pp;
-	int i, n, n_chips = 0, n_channels, chip, channel;
-	unsigned long flags;
-
-	/*
-	 * did we get here by accident?
-	 */
-	if(!BUS_PRESENT) {
-		printk("Not on JUNKIO machine, skipping probe_sccs\n");
-		return;
-	}
-
-	switch(mips_machtype) {
-#ifdef CONFIG_MACH_DECSTATION
-	case MACH_DS5000_2X0:
-	case MACH_DS5900:
-		n_chips = 2;
-		zs_parms = &ds_parms;
-		zs_parms->irq0 = dec_interrupt[DEC_IRQ_SCC0];
-		zs_parms->irq1 = dec_interrupt[DEC_IRQ_SCC1];
-		break;
-	case MACH_DS5000_1XX:
-		n_chips = 2;
-		zs_parms = &ds_parms;
-		zs_parms->irq0 = dec_interrupt[DEC_IRQ_SCC0];
-		zs_parms->irq1 = dec_interrupt[DEC_IRQ_SCC1];
-		break;
-	case MACH_DS5000_XX:
-		n_chips = 1;
-		zs_parms = &ds_parms;
-		zs_parms->irq0 = dec_interrupt[DEC_IRQ_SCC0];
-		break;
-#endif
-	default:
-		panic("zs: unsupported bus");
-	}
-	if (!zs_parms)
-		panic("zs: uninitialized parms");
-
-	pp = &zs_chain;
-
-	n_channels = 0;
-
-	for (chip = 0; chip < n_chips; chip++) {
-		for (channel = 0; channel <= 1; channel++) {
-			/*
-			 * The sccs reside on the high byte of the 16 bit IOBUS
-			 */
-			zs_channels[n_channels].control =
-				(volatile void *)CKSEG1ADDR(dec_kn_slot_base +
-			  (0 == chip ? zs_parms->scc0 : zs_parms->scc1) +
-			  (0 == channel ? zs_parms->channel_a_offset :
-			                  zs_parms->channel_b_offset));
-			zs_channels[n_channels].data =
-				zs_channels[n_channels].control + 4;
-
-#ifndef CONFIG_SERIAL_DEC_CONSOLE
-			/*
-			 * We're called early and memory managment isn't up, yet.
-			 * Thus request_region would fail.
-			 */
-			if (!request_region((unsigned long)
-					 zs_channels[n_channels].control,
-					 ZS_CHAN_IO_SIZE, "SCC"))
-				panic("SCC I/O region is not free");
-#endif
-			zs_soft[n_channels].zs_channel = &zs_channels[n_channels];
-			/* HACK alert! */
-			if (!(chip & 1))
-				zs_soft[n_channels].irq = zs_parms->irq0;
-			else
-				zs_soft[n_channels].irq = zs_parms->irq1;
-
-			/*
-			 *  Identification of channel A. Location of channel A
-                         *  inside chip depends on mapping of internal address
-			 *  the chip decodes channels by.
-			 *  CHANNEL_A_NR returns either 0 (in case of
-			 *  DECstations) or 1 (in case of Baget).
-			 */
-			if (CHANNEL_A_NR == channel)
-				zs_soft[n_channels].zs_chan_a =
-				    &zs_channels[n_channels+1-2*CHANNEL_A_NR];
-			else
-				zs_soft[n_channels].zs_chan_a =
-				    &zs_channels[n_channels];
-
-			*pp = &zs_soft[n_channels];
-			pp = &zs_soft[n_channels].zs_next;
-			n_channels++;
-		}
-	}
-
-	*pp = 0;
-	zs_channels_found = n_channels;
-
-	for (n = 0; n < zs_channels_found; n++) {
-		for (i = 0; i < 16; i++) {
-			zs_soft[n].zs_channel->curregs[i] = zs_init_regs[i];
-		}
-	}
-
-	spin_lock_irqsave(&zs_lock, flags);
-	for (n = 0; n < zs_channels_found; n++) {
-		if (n % 2 == 0) {
-			write_zsreg(zs_soft[n].zs_chan_a, R9, FHWRES);
-			udelay(10);
-			write_zsreg(zs_soft[n].zs_chan_a, R9, 0);
-		}
-		load_zsregs(zs_soft[n].zs_channel,
-			    zs_soft[n].zs_channel->curregs);
-	}
-	spin_unlock_irqrestore(&zs_lock, flags);
-}
-
-static const struct tty_operations serial_ops = {
-	.open = rs_open,
-	.close = rs_close,
-	.write = rs_write,
-	.flush_chars = rs_flush_chars,
-	.write_room = rs_write_room,
-	.chars_in_buffer = rs_chars_in_buffer,
-	.flush_buffer = rs_flush_buffer,
-	.ioctl = rs_ioctl,
-	.throttle = rs_throttle,
-	.unthrottle = rs_unthrottle,
-	.set_termios = rs_set_termios,
-	.stop = rs_stop,
-	.start = rs_start,
-	.hangup = rs_hangup,
-	.break_ctl = rs_break,
-	.wait_until_sent = rs_wait_until_sent,
-	.tiocmget = rs_tiocmget,
-	.tiocmset = rs_tiocmset,
-};
-
-/* zs_init inits the driver */
-int __init zs_init(void)
-{
-	int channel, i;
-	struct dec_serial *info;
-
-	if(!BUS_PRESENT)
-		return -ENODEV;
-
-	/* Find out how many Z8530 SCCs we have */
-	if (zs_chain == 0)
-		probe_sccs();
-	serial_driver = alloc_tty_driver(zs_channels_found);
-	if (!serial_driver)
-		return -ENOMEM;
-
-	show_serial_version();
-
-	/* Initialize the tty_driver structure */
-	/* Not all of this is exactly right for us. */
-
-	serial_driver->owner = THIS_MODULE;
-	serial_driver->name = "ttyS";
-	serial_driver->major = TTY_MAJOR;
-	serial_driver->minor_start = 64;
-	serial_driver->type = TTY_DRIVER_TYPE_SERIAL;
-	serial_driver->subtype = SERIAL_TYPE_NORMAL;
-	serial_driver->init_termios = tty_std_termios;
-	serial_driver->init_termios.c_cflag =
-		B9600 | CS8 | CREAD | HUPCL | CLOCAL;
-	serial_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
-	tty_set_operations(serial_driver, &serial_ops);
-
-	if (tty_register_driver(serial_driver))
-		panic("Couldn't register serial driver");
-
-	for (info = zs_chain, i = 0; info; info = info->zs_next, i++) {
-
-		/* Needed before interrupts are enabled. */
-		info->tty = 0;
-		info->x_char = 0;
-
-		if (info->hook && info->hook->init_info) {
-			(*info->hook->init_info)(info);
-			continue;
-		}
-
-		info->magic = SERIAL_MAGIC;
-		info->port = (int) info->zs_channel->control;
-		info->line = i;
-		info->custom_divisor = 16;
-		info->close_delay = 50;
-		info->closing_wait = 3000;
-		info->event = 0;
-		info->count = 0;
-		info->blocked_open = 0;
-		tasklet_init(&info->tlet, do_softint, (unsigned long)info);
-		init_waitqueue_head(&info->open_wait);
-		init_waitqueue_head(&info->close_wait);
-		printk("ttyS%02d at 0x%08x (irq = %d) is a Z85C30 SCC\n",
-		       info->line, info->port, info->irq);
-		tty_register_device(serial_driver, info->line, NULL);
-
-	}
-
-	for (channel = 0; channel < zs_channels_found; ++channel) {
-		zs_soft[channel].clk_divisor = 16;
-		zs_soft[channel].zs_baud = get_zsbaud(&zs_soft[channel]);
-
-		if (request_irq(zs_soft[channel].irq, rs_interrupt, IRQF_SHARED,
-				"scc", &zs_soft[channel]))
-			printk(KERN_ERR "decserial: can't get irq %d\n",
-			       zs_soft[channel].irq);
-
-		if (zs_soft[channel].hook) {
-			zs_startup(&zs_soft[channel]);
-			if (zs_soft[channel].hook->init_channel)
-				(*zs_soft[channel].hook->init_channel)
-					(&zs_soft[channel]);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * polling I/O routines
- */
-static int zs_poll_tx_char(void *handle, unsigned char ch)
-{
-	struct dec_serial *info = handle;
-	struct dec_zschannel *chan = info->zs_channel;
-	int    ret;
-
-	if(chan) {
-		int loops = 10000;
-
-		while (loops && !(read_zsreg(chan, 0) & Tx_BUF_EMP))
-			loops--;
-
-		if (loops) {
-			write_zsdata(chan, ch);
-			ret = 0;
-		} else
-			ret = -EAGAIN;
-
-		return ret;
-	} else
-		return -ENODEV;
-}
-
-static int zs_poll_rx_char(void *handle)
-{
-	struct dec_serial *info = handle;
-        struct dec_zschannel *chan = info->zs_channel;
-        int    ret;
-
-	if(chan) {
-                int loops = 10000;
-
-		while (loops && !(read_zsreg(chan, 0) & Rx_CH_AV))
-			loops--;
-
-                if (loops)
-                        ret = read_zsdata(chan);
-                else
-                        ret = -EAGAIN;
-
-		return ret;
-	} else
-		return -ENODEV;
-}
-
-int register_zs_hook(unsigned int channel, struct dec_serial_hook *hook)
-{
-	struct dec_serial *info = &zs_soft[channel];
-
-	if (info->hook) {
-		printk("%s: line %d has already a hook registered\n",
-		       __FUNCTION__, channel);
-
-		return 0;
-	} else {
-		hook->poll_rx_char = zs_poll_rx_char;
-		hook->poll_tx_char = zs_poll_tx_char;
-		info->hook = hook;
-
-		return 1;
-	}
-}
-
-int unregister_zs_hook(unsigned int channel)
-{
-	struct dec_serial *info = &zs_soft[channel];
-
-        if (info->hook) {
-                info->hook = NULL;
-                return 1;
-        } else {
-                printk("%s: trying to unregister hook on line %d,"
-                       " but none is registered\n", __FUNCTION__, channel);
-                return 0;
-        }
-}
-
-/*
- * ------------------------------------------------------------
- * Serial console driver
- * ------------------------------------------------------------
- */
-#ifdef CONFIG_SERIAL_DEC_CONSOLE
-
-
-/*
- *	Print a string to the serial port trying not to disturb
- *	any possible real use of the port...
- */
-static void serial_console_write(struct console *co, const char *s,
-				 unsigned count)
-{
-	struct dec_serial *info;
-	int i;
-
-	info = zs_soft + co->index;
-
-	for (i = 0; i < count; i++, s++) {
-		if(*s == '\n')
-			zs_poll_tx_char(info, '\r');
-		zs_poll_tx_char(info, *s);
-	}
-}
-
-static struct tty_driver *serial_console_device(struct console *c, int *index)
-{
-	*index = c->index;
-	return serial_driver;
-}
-
-/*
- *	Setup initial baud/bits/parity. We do two things here:
- *	- construct a cflag setting for the first rs_open()
- *	- initialize the serial port
- *	Return non-zero if we didn't find a serial port.
- */
-static int __init serial_console_setup(struct console *co, char *options)
-{
-	struct dec_serial *info;
-	int baud = 9600;
-	int bits = 8;
-	int parity = 'n';
-	int cflag = CREAD | HUPCL | CLOCAL;
-	int clk_divisor = 16;
-	int brg;
-	char *s;
-	unsigned long flags;
-
-	if(!BUS_PRESENT)
-		return -ENODEV;
-
-	info = zs_soft + co->index;
-
-	if (zs_chain == 0)
-		probe_sccs();
-
-	info->is_cons = 1;
-
-	if (options) {
-		baud = simple_strtoul(options, NULL, 10);
-		s = options;
-		while(*s >= '0' && *s <= '9')
-			s++;
-		if (*s)
-			parity = *s++;
-		if (*s)
-			bits   = *s - '0';
-	}
-
-	/*
-	 *	Now construct a cflag setting.
-	 */
-	switch(baud) {
-	case 1200:
-		cflag |= B1200;
-		break;
-	case 2400:
-		cflag |= B2400;
-		break;
-	case 4800:
-		cflag |= B4800;
-		break;
-	case 19200:
-		cflag |= B19200;
-		break;
-	case 38400:
-		cflag |= B38400;
-		break;
-	case 57600:
-		cflag |= B57600;
-		break;
-	case 115200:
-		cflag |= B115200;
-		break;
-	case 9600:
-	default:
-		cflag |= B9600;
-		/*
-		 * Set this to a sane value to prevent a divide error.
-		 */
-		baud  = 9600;
-		break;
-	}
-	switch(bits) {
-	case 7:
-		cflag |= CS7;
-		break;
-	default:
-	case 8:
-		cflag |= CS8;
-		break;
-	}
-	switch(parity) {
-	case 'o': case 'O':
-		cflag |= PARODD;
-		break;
-	case 'e': case 'E':
-		cflag |= PARENB;
-		break;
-	}
-	co->cflag = cflag;
-
-	spin_lock_irqsave(&zs_lock, flags);
-
-	/*
-	 * Set up the baud rate generator.
-	 */
-	brg = BPS_TO_BRG(baud, zs_parms->clock / clk_divisor);
-	info->zs_channel->curregs[R12] = (brg & 255);
-	info->zs_channel->curregs[R13] = ((brg >> 8) & 255);
-
-	/*
-	 * Set byte size and parity.
-	 */
-	if (bits == 7) {
-		info->zs_channel->curregs[R3] |= Rx7;
-		info->zs_channel->curregs[R5] |= Tx7;
-	} else {
-		info->zs_channel->curregs[R3] |= Rx8;
-		info->zs_channel->curregs[R5] |= Tx8;
-	}
-	if (cflag & PARENB) {
-		info->zs_channel->curregs[R4] |= PAR_ENA;
-	}
-	if (!(cflag & PARODD)) {
-		info->zs_channel->curregs[R4] |= PAR_EVEN;
-	}
-	info->zs_channel->curregs[R4] |= SB1;
-
-	/*
-	 * Turn on RTS and DTR.
-	 */
-	zs_rtsdtr(info, RTS | DTR, 1);
-
-	/*
-	 * Finally, enable sequencing.
-	 */
-	info->zs_channel->curregs[R3] |= RxENABLE;
-	info->zs_channel->curregs[R5] |= TxENAB;
-
-	/*
-	 * Clear the interrupt registers.
-	 */
-	write_zsreg(info->zs_channel, R0, ERR_RES);
-	write_zsreg(info->zs_channel, R0, RES_H_IUS);
-
-	/*
-	 * Load up the new values.
-	 */
-	load_zsregs(info->zs_channel, info->zs_channel->curregs);
-
-	/* Save the current value of RR0 */
-	info->read_reg_zero = read_zsreg(info->zs_channel, R0);
-
-	zs_soft[co->index].clk_divisor = clk_divisor;
-	zs_soft[co->index].zs_baud = get_zsbaud(&zs_soft[co->index]);
-
-	spin_unlock_irqrestore(&zs_lock, flags);
-
-	return 0;
-}
-
-static struct console zs_console = {
-	.name		= "ttyS",
-	.write		= serial_console_write,
-	.device		= serial_console_device,
-	.setup		= serial_console_setup,
-	.flags		= CON_PRINTBUFFER,
-	.index		= -1,
-};
-
-/*
- *	Register console.
- */
-void __init zs_serial_console_init(void)
-{
-	register_console(&zs_console);
-}
-#endif /* ifdef CONFIG_SERIAL_DEC_CONSOLE */
-
-#ifdef CONFIG_KGDB
-struct dec_zschannel *zs_kgdbchan;
-static unsigned char scc_inittab[] = {
-	9,  0x80,	/* reset A side (CHRA) */
-	13, 0,		/* set baud rate divisor */
-	12, 1,
-	14, 1,		/* baud rate gen enable, src=rtxc (BRENABL) */
-	11, 0x50,	/* clocks = br gen (RCBR | TCBR) */
-	5,  0x6a,	/* tx 8 bits, assert RTS (Tx8 | TxENAB | RTS) */
-	4,  0x44,	/* x16 clock, 1 stop (SB1 | X16CLK)*/
-	3,  0xc1,	/* rx enable, 8 bits (RxENABLE | Rx8)*/
-};
-
-/* These are for receiving and sending characters under the kgdb
- * source level kernel debugger.
- */
-void putDebugChar(char kgdb_char)
-{
-	struct dec_zschannel *chan = zs_kgdbchan;
-	while ((read_zsreg(chan, 0) & Tx_BUF_EMP) == 0)
-		RECOVERY_DELAY;
-	write_zsdata(chan, kgdb_char);
-}
-char getDebugChar(void)
-{
-	struct dec_zschannel *chan = zs_kgdbchan;
-	while((read_zsreg(chan, 0) & Rx_CH_AV) == 0)
-		eieio(); /*barrier();*/
-	return read_zsdata(chan);
-}
-void kgdb_interruptible(int yes)
-{
-	struct dec_zschannel *chan = zs_kgdbchan;
-	int one, nine;
-	nine = read_zsreg(chan, 9);
-	if (yes == 1) {
-		one = EXT_INT_ENAB|RxINT_ALL;
-		nine |= MIE;
-		printk("turning serial ints on\n");
-	} else {
-		one = RxINT_DISAB;
-		nine &= ~MIE;
-		printk("turning serial ints off\n");
-	}
-	write_zsreg(chan, 1, one);
-	write_zsreg(chan, 9, nine);
-}
-
-static int kgdbhook_init_channel(void *handle)
-{
-	return 0;
-}
-
-static void kgdbhook_init_info(void *handle)
-{
-}
-
-static void kgdbhook_rx_char(void *handle, unsigned char ch, unsigned char fl)
-{
-	struct dec_serial *info = handle;
-
-	if (fl != TTY_NORMAL)
-		return;
-	if (ch == 0x03 || ch == '$')
-		breakpoint();
-}
-
-/* This sets up the serial port we're using, and turns on
- * interrupts for that channel, so kgdb is usable once we're done.
- */
-static inline void kgdb_chaninit(struct dec_zschannel *ms, int intson, int bps)
-{
-	int brg;
-	int i, x;
-	volatile char *sccc = ms->control;
-	brg = BPS_TO_BRG(bps, zs_parms->clock/16);
-	printk("setting bps on kgdb line to %d [brg=%x]\n", bps, brg);
-	for (i = 20000; i != 0; --i) {
-		x = *sccc; eieio();
-	}
-	for (i = 0; i < sizeof(scc_inittab); ++i) {
-		write_zsreg(ms, scc_inittab[i], scc_inittab[i+1]);
-		i++;
-	}
-}
-/* This is called at boot time to prime the kgdb serial debugging
- * serial line.  The 'tty_num' argument is 0 for /dev/ttya and 1
- * for /dev/ttyb which is determined in setup_arch() from the
- * boot command line flags.
- */
-struct dec_serial_hook zs_kgdbhook = {
-	.init_channel	= kgdbhook_init_channel,
-	.init_info	= kgdbhook_init_info,
-	.rx_char	= kgdbhook_rx_char,
-	.cflags		= B38400 | CS8 | CLOCAL,
-};
-
-void __init zs_kgdb_hook(int tty_num)
-{
-	/* Find out how many Z8530 SCCs we have */
-	if (zs_chain == 0)
-		probe_sccs();
-	zs_soft[tty_num].zs_channel = &zs_channels[tty_num];
-	zs_kgdbchan = zs_soft[tty_num].zs_channel;
-	zs_soft[tty_num].change_needed = 0;
-	zs_soft[tty_num].clk_divisor = 16;
-	zs_soft[tty_num].zs_baud = 38400;
- 	zs_soft[tty_num].hook = &zs_kgdbhook; /* This runs kgdb */
-	/* Turn on transmitter/receiver at 8-bits/char */
-        kgdb_chaninit(zs_soft[tty_num].zs_channel, 1, 38400);
-	printk("KGDB: on channel %d initialized\n", tty_num);
-	set_debug_traps(); /* init stub */
-}
-#endif /* ifdef CONFIG_KGDB */
diff --git a/drivers/tc/zs.h b/drivers/tc/zs.h
deleted file mode 100644
index 13512200ceba..000000000000
--- a/drivers/tc/zs.h
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
- * drivers/tc/zs.h: Definitions for the DECstation Z85C30 serial driver.
- *
- * Adapted from drivers/sbus/char/sunserial.h by Paul Mackerras.
- * Adapted from drivers/macintosh/macserial.h by Harald Koerfgen.
- *
- * Copyright (C) 1996 Paul Mackerras (Paul.Mackerras@cs.anu.edu.au)
- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 2004, 2005  Maciej W. Rozycki
- */
-#ifndef _DECSERIAL_H
-#define _DECSERIAL_H
-
-#include <asm/dec/serial.h>
-
-#define NUM_ZSREGS 16
-
-struct serial_struct {
-	int	type;
-	int	line;
-	int	port;
-	int	irq;
-	int	flags;
-	int	xmit_fifo_size;
-	int	custom_divisor;
-	int	baud_base;
-	unsigned short	close_delay;
-	char	reserved_char[2];
-	int	hub6;
-	unsigned short	closing_wait; /* time to wait before closing */
-	unsigned short	closing_wait2; /* no longer used... */
-	int	reserved[4];
-};
-
-/*
- * For the close wait times, 0 means wait forever for serial port to
- * flush its output.  65535 means don't wait at all.
- */
-#define ZILOG_CLOSING_WAIT_INF	0
-#define ZILOG_CLOSING_WAIT_NONE	65535
-
-/*
- * Definitions for ZILOG_struct (and serial_struct) flags field
- */
-#define ZILOG_HUP_NOTIFY 0x0001 /* Notify getty on hangups and closes
-				   on the callout port */
-#define ZILOG_FOURPORT  0x0002	/* Set OU1, OUT2 per AST Fourport settings */
-#define ZILOG_SAK	0x0004	/* Secure Attention Key (Orange book) */
-#define ZILOG_SPLIT_TERMIOS 0x0008 /* Separate termios for dialin/callout */
-
-#define ZILOG_SPD_MASK	0x0030
-#define ZILOG_SPD_HI	0x0010	/* Use 56000 instead of 38400 bps */
-
-#define ZILOG_SPD_VHI	0x0020  /* Use 115200 instead of 38400 bps */
-#define ZILOG_SPD_CUST	0x0030  /* Use user-specified divisor */
-
-#define ZILOG_SKIP_TEST	0x0040 /* Skip UART test during autoconfiguration */
-#define ZILOG_AUTO_IRQ  0x0080 /* Do automatic IRQ during autoconfiguration */
-#define ZILOG_SESSION_LOCKOUT 0x0100 /* Lock out cua opens based on session */
-#define ZILOG_PGRP_LOCKOUT    0x0200 /* Lock out cua opens based on pgrp */
-#define ZILOG_CALLOUT_NOHUP   0x0400 /* Don't do hangups for cua device */
-
-#define ZILOG_FLAGS	0x0FFF	/* Possible legal ZILOG flags */
-#define ZILOG_USR_MASK 0x0430	/* Legal flags that non-privileged
-				 * users can set or reset */
-
-/* Internal flags used only by kernel/chr_drv/serial.c */
-#define ZILOG_INITIALIZED	0x80000000 /* Serial port was initialized */
-#define ZILOG_CALLOUT_ACTIVE	0x40000000 /* Call out device is active */
-#define ZILOG_NORMAL_ACTIVE	0x20000000 /* Normal device is active */
-#define ZILOG_BOOT_AUTOCONF	0x10000000 /* Autoconfigure port on bootup */
-#define ZILOG_CLOSING		0x08000000 /* Serial port is closing */
-#define ZILOG_CTS_FLOW		0x04000000 /* Do CTS flow control */
-#define ZILOG_CHECK_CD		0x02000000 /* i.e., CLOCAL */
-
-/* Software state per channel */
-
-#ifdef __KERNEL__
-/*
- * This is our internal structure for each serial port's state.
- *
- * Many fields are paralleled by the structure used by the serial_struct
- * structure.
- *
- * For definitions of the flags field, see tty.h
- */
-
-struct dec_zschannel {
-	volatile unsigned char *control;
-	volatile unsigned char *data;
-
-	/* Current write register values */
-	unsigned char curregs[NUM_ZSREGS];
-};
-
-struct dec_serial {
-	struct dec_serial	*zs_next;	/* For IRQ servicing chain.  */
-	struct dec_zschannel	*zs_channel;	/* Channel registers.  */
-	struct dec_zschannel	*zs_chan_a;	/* A side registers.  */
-	unsigned char		read_reg_zero;
-
-	struct dec_serial_hook	*hook;		/* Hook on this channel.  */
-	int			tty_break;	/* Set on BREAK condition.  */
-	int			is_cons;	/* Is this our console.  */
-	int			tx_active;	/* Char is being xmitted.  */
-	int			tx_stopped;	/* Output is suspended.  */
-
-	/*
-	 * We need to know the current clock divisor
-	 * to read the bps rate the chip has currently loaded.
-	 */
-	int			clk_divisor;	/* May be 1, 16, 32, or 64.  */
-	int			zs_baud;
-
-	char			change_needed;
-
-	int			magic;
-	int			baud_base;
-	int			port;
-	int			irq;
-	int			flags; 		/* Defined in tty.h.  */
-	int			type; 		/* UART type.  */
-	struct tty_struct 	*tty;
-	int			read_status_mask;
-	int			ignore_status_mask;
-	int			timeout;
-	int			xmit_fifo_size;
-	int			custom_divisor;
-	int			x_char;		/* XON/XOFF character.  */
-	int			close_delay;
-	unsigned short		closing_wait;
-	unsigned short		closing_wait2;
-	unsigned long		event;
-	unsigned long		last_active;
-	int			line;
-	int			count;		/* # of fds on device.  */
-	int			blocked_open;	/* # of blocked opens.  */
-	unsigned char 		*xmit_buf;
-	int			xmit_head;
-	int			xmit_tail;
-	int			xmit_cnt;
-	struct tasklet_struct	tlet;
-	wait_queue_head_t	open_wait;
-	wait_queue_head_t	close_wait;
-};
-
-
-#define SERIAL_MAGIC 0x5301
-
-/*
- * The size of the serial xmit buffer is 1 page, or 4096 bytes
- */
-#define SERIAL_XMIT_SIZE 4096
-
-/*
- * Events are used to schedule things to happen at timer-interrupt
- * time, instead of at rs interrupt time.
- */
-#define RS_EVENT_WRITE_WAKEUP	0
-
-#endif /* __KERNEL__ */
-
-/* Conversion routines to/from brg time constants from/to bits
- * per second.
- */
-#define BRG_TO_BPS(brg, freq) ((freq) / 2 / ((brg) + 2))
-#define BPS_TO_BRG(bps, freq) ((((freq) + (bps)) / (2 * (bps))) - 2)
-
-/* The Zilog register set */
-
-#define	FLAG	0x7e
-
-/* Write Register 0 */
-#define	R0	0		/* Register selects */
-#define	R1	1
-#define	R2	2
-#define	R3	3
-#define	R4	4
-#define	R5	5
-#define	R6	6
-#define	R7	7
-#define	R8	8
-#define	R9	9
-#define	R10	10
-#define	R11	11
-#define	R12	12
-#define	R13	13
-#define	R14	14
-#define	R15	15
-
-#define	NULLCODE	0	/* Null Code */
-#define	POINT_HIGH	0x8	/* Select upper half of registers */
-#define	RES_EXT_INT	0x10	/* Reset Ext. Status Interrupts */
-#define	SEND_ABORT	0x18	/* HDLC Abort */
-#define	RES_RxINT_FC	0x20	/* Reset RxINT on First Character */
-#define	RES_Tx_P	0x28	/* Reset TxINT Pending */
-#define	ERR_RES		0x30	/* Error Reset */
-#define	RES_H_IUS	0x38	/* Reset highest IUS */
-
-#define	RES_Rx_CRC	0x40	/* Reset Rx CRC Checker */
-#define	RES_Tx_CRC	0x80	/* Reset Tx CRC Checker */
-#define	RES_EOM_L	0xC0	/* Reset EOM latch */
-
-/* Write Register 1 */
-
-#define	EXT_INT_ENAB	0x1	/* Ext Int Enable */
-#define	TxINT_ENAB	0x2	/* Tx Int Enable */
-#define	PAR_SPEC	0x4	/* Parity is special condition */
-
-#define	RxINT_DISAB	0	/* Rx Int Disable */
-#define	RxINT_FCERR	0x8	/* Rx Int on First Character Only or Error */
-#define	RxINT_ALL	0x10	/* Int on all Rx Characters or error */
-#define	RxINT_ERR	0x18	/* Int on error only */
-#define	RxINT_MASK	0x18
-
-#define	WT_RDY_RT	0x20	/* Wait/Ready on R/T */
-#define	WT_FN_RDYFN	0x40	/* Wait/FN/Ready FN */
-#define	WT_RDY_ENAB	0x80	/* Wait/Ready Enable */
-
-/* Write Register #2 (Interrupt Vector) */
-
-/* Write Register 3 */
-
-#define	RxENABLE	0x1	/* Rx Enable */
-#define	SYNC_L_INH	0x2	/* Sync Character Load Inhibit */
-#define	ADD_SM		0x4	/* Address Search Mode (SDLC) */
-#define	RxCRC_ENAB	0x8	/* Rx CRC Enable */
-#define	ENT_HM		0x10	/* Enter Hunt Mode */
-#define	AUTO_ENAB	0x20	/* Auto Enables */
-#define	Rx5		0x0	/* Rx 5 Bits/Character */
-#define	Rx7		0x40	/* Rx 7 Bits/Character */
-#define	Rx6		0x80	/* Rx 6 Bits/Character */
-#define	Rx8		0xc0	/* Rx 8 Bits/Character */
-#define RxNBITS_MASK	0xc0
-
-/* Write Register 4 */
-
-#define	PAR_ENA		0x1	/* Parity Enable */
-#define	PAR_EVEN	0x2	/* Parity Even/Odd* */
-
-#define	SYNC_ENAB	0	/* Sync Modes Enable */
-#define	SB1		0x4	/* 1 stop bit/char */
-#define	SB15		0x8	/* 1.5 stop bits/char */
-#define	SB2		0xc	/* 2 stop bits/char */
-#define SB_MASK		0xc
-
-#define	MONSYNC		0	/* 8 Bit Sync character */
-#define	BISYNC		0x10	/* 16 bit sync character */
-#define	SDLC		0x20	/* SDLC Mode (01111110 Sync Flag) */
-#define	EXTSYNC		0x30	/* External Sync Mode */
-
-#define	X1CLK		0x0	/* x1 clock mode */
-#define	X16CLK		0x40	/* x16 clock mode */
-#define	X32CLK		0x80	/* x32 clock mode */
-#define	X64CLK		0xC0	/* x64 clock mode */
-#define XCLK_MASK	0xC0
-
-/* Write Register 5 */
-
-#define	TxCRC_ENAB	0x1	/* Tx CRC Enable */
-#define	RTS		0x2	/* RTS */
-#define	SDLC_CRC	0x4	/* SDLC/CRC-16 */
-#define	TxENAB		0x8	/* Tx Enable */
-#define	SND_BRK		0x10	/* Send Break */
-#define	Tx5		0x0	/* Tx 5 bits (or less)/character */
-#define	Tx7		0x20	/* Tx 7 bits/character */
-#define	Tx6		0x40	/* Tx 6 bits/character */
-#define	Tx8		0x60	/* Tx 8 bits/character */
-#define TxNBITS_MASK	0x60
-#define	DTR		0x80	/* DTR */
-
-/* Write Register 6 (Sync bits 0-7/SDLC Address Field) */
-
-/* Write Register 7 (Sync bits 8-15/SDLC 01111110) */
-
-/* Write Register 8 (transmit buffer) */
-
-/* Write Register 9 (Master interrupt control) */
-#define	VIS	1	/* Vector Includes Status */
-#define	NV	2	/* No Vector */
-#define	DLC	4	/* Disable Lower Chain */
-#define	MIE	8	/* Master Interrupt Enable */
-#define	STATHI	0x10	/* Status high */
-#define	SOFTACK	0x20	/* Software Interrupt Acknowledge */
-#define	NORESET	0	/* No reset on write to R9 */
-#define	CHRB	0x40	/* Reset channel B */
-#define	CHRA	0x80	/* Reset channel A */
-#define	FHWRES	0xc0	/* Force hardware reset */
-
-/* Write Register 10 (misc control bits) */
-#define	BIT6	1	/* 6 bit/8bit sync */
-#define	LOOPMODE 2	/* SDLC Loop mode */
-#define	ABUNDER	4	/* Abort/flag on SDLC xmit underrun */
-#define	MARKIDLE 8	/* Mark/flag on idle */
-#define	GAOP	0x10	/* Go active on poll */
-#define	NRZ	0	/* NRZ mode */
-#define	NRZI	0x20	/* NRZI mode */
-#define	FM1	0x40	/* FM1 (transition = 1) */
-#define	FM0	0x60	/* FM0 (transition = 0) */
-#define	CRCPS	0x80	/* CRC Preset I/O */
-
-/* Write Register 11 (Clock Mode control) */
-#define	TRxCXT	0	/* TRxC = Xtal output */
-#define	TRxCTC	1	/* TRxC = Transmit clock */
-#define	TRxCBR	2	/* TRxC = BR Generator Output */
-#define	TRxCDP	3	/* TRxC = DPLL output */
-#define	TRxCOI	4	/* TRxC O/I */
-#define	TCRTxCP	0	/* Transmit clock = RTxC pin */
-#define	TCTRxCP	8	/* Transmit clock = TRxC pin */
-#define	TCBR	0x10	/* Transmit clock = BR Generator output */
-#define	TCDPLL	0x18	/* Transmit clock = DPLL output */
-#define	RCRTxCP	0	/* Receive clock = RTxC pin */
-#define	RCTRxCP	0x20	/* Receive clock = TRxC pin */
-#define	RCBR	0x40	/* Receive clock = BR Generator output */
-#define	RCDPLL	0x60	/* Receive clock = DPLL output */
-#define	RTxCX	0x80	/* RTxC Xtal/No Xtal */
-
-/* Write Register 12 (lower byte of baud rate generator time constant) */
-
-/* Write Register 13 (upper byte of baud rate generator time constant) */
-
-/* Write Register 14 (Misc control bits) */
-#define	BRENABL	1	/* Baud rate generator enable */
-#define	BRSRC	2	/* Baud rate generator source */
-#define	DTRREQ	4	/* DTR/Request function */
-#define	AUTOECHO 8	/* Auto Echo */
-#define	LOOPBAK	0x10	/* Local loopback */
-#define	SEARCH	0x20	/* Enter search mode */
-#define	RMC	0x40	/* Reset missing clock */
-#define	DISDPLL	0x60	/* Disable DPLL */
-#define	SSBR	0x80	/* Set DPLL source = BR generator */
-#define	SSRTxC	0xa0	/* Set DPLL source = RTxC */
-#define	SFMM	0xc0	/* Set FM mode */
-#define	SNRZI	0xe0	/* Set NRZI mode */
-
-/* Write Register 15 (external/status interrupt control) */
-#define	ZCIE	2	/* Zero count IE */
-#define	DCDIE	8	/* DCD IE */
-#define	SYNCIE	0x10	/* Sync/hunt IE */
-#define	CTSIE	0x20	/* CTS IE */
-#define	TxUIE	0x40	/* Tx Underrun/EOM IE */
-#define	BRKIE	0x80	/* Break/Abort IE */
-
-
-/* Read Register 0 */
-#define	Rx_CH_AV	0x1	/* Rx Character Available */
-#define	ZCOUNT		0x2	/* Zero count */
-#define	Tx_BUF_EMP	0x4	/* Tx Buffer empty */
-#define	DCD		0x8	/* DCD */
-#define	SYNC_HUNT	0x10	/* Sync/hunt */
-#define	CTS		0x20	/* CTS */
-#define	TxEOM		0x40	/* Tx underrun */
-#define	BRK_ABRT	0x80	/* Break/Abort */
-
-/* Read Register 1 */
-#define	ALL_SNT		0x1	/* All sent */
-/* Residue Data for 8 Rx bits/char programmed */
-#define	RES3		0x8	/* 0/3 */
-#define	RES4		0x4	/* 0/4 */
-#define	RES5		0xc	/* 0/5 */
-#define	RES6		0x2	/* 0/6 */
-#define	RES7		0xa	/* 0/7 */
-#define	RES8		0x6	/* 0/8 */
-#define	RES18		0xe	/* 1/8 */
-#define	RES28		0x0	/* 2/8 */
-/* Special Rx Condition Interrupts */
-#define	PAR_ERR		0x10	/* Parity error */
-#define	Rx_OVR		0x20	/* Rx Overrun Error */
-#define	FRM_ERR		0x40	/* CRC/Framing Error */
-#define	END_FR		0x80	/* End of Frame (SDLC) */
-
-/* Read Register 2 (channel b only) - Interrupt vector */
-
-/* Read Register 3 (interrupt pending register) ch a only */
-#define	CHBEXT	0x1		/* Channel B Ext/Stat IP */
-#define	CHBTxIP	0x2		/* Channel B Tx IP */
-#define	CHBRxIP	0x4		/* Channel B Rx IP */
-#define	CHAEXT	0x8		/* Channel A Ext/Stat IP */
-#define	CHATxIP	0x10		/* Channel A Tx IP */
-#define	CHARxIP	0x20		/* Channel A Rx IP */
-
-/* Read Register 8 (receive data register) */
-
-/* Read Register 10  (misc status bits) */
-#define	ONLOOP	2		/* On loop */
-#define	LOOPSEND 0x10		/* Loop sending */
-#define	CLK2MIS	0x40		/* Two clocks missing */
-#define	CLK1MIS	0x80		/* One clock missing */
-
-/* Read Register 12 (lower byte of baud rate generator constant) */
-
-/* Read Register 13 (upper byte of baud rate generator constant) */
-
-/* Read Register 15 (value of WR 15) */
-
-/* Misc macros */
-#define ZS_CLEARERR(channel)	(write_zsreg(channel, 0, ERR_RES))
-#define ZS_CLEARFIFO(channel)	do { volatile unsigned char garbage; \
-				     garbage = read_zsdata(channel); \
-				     garbage = read_zsdata(channel); \
-				     garbage = read_zsdata(channel); \
-				} while(0)
-
-#endif /* !(_DECSERIAL_H) */
diff --git a/include/asm-mips/dec/serial.h b/include/asm-mips/dec/serial.h
deleted file mode 100644
index acad75890a05..000000000000
--- a/include/asm-mips/dec/serial.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- *	include/asm-mips/dec/serial.h
- *
- *	Definitions common to all DECstation serial devices.
- *
- *	Copyright (C) 2004  Maciej W. Rozycki
- *
- *	Based on bits extracted from drivers/tc/zs.h for which
- *	the following copyrights apply:
- *
- *	Copyright (C) 1995  David S. Miller (davem@caip.rutgers.edu)
- *	Copyright (C) 1996  Paul Mackerras (Paul.Mackerras@cs.anu.edu.au)
- *	Copyright (C)       Harald Koerfgen
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- */
-#ifndef __ASM_MIPS_DEC_SERIAL_H
-#define __ASM_MIPS_DEC_SERIAL_H
-
-struct dec_serial_hook {
-	int (*init_channel)(void *handle);
-	void (*init_info)(void *handle);
-	void (*rx_char)(unsigned char ch, unsigned char fl);
-	int (*poll_rx_char)(void *handle);
-	int (*poll_tx_char)(void *handle, unsigned char ch);
-	unsigned int cflags;
-};
-
-extern int register_dec_serial_hook(unsigned int channel,
-				    struct dec_serial_hook *hook);
-extern int unregister_dec_serial_hook(unsigned int channel);
-
-#endif /* __ASM_MIPS_DEC_SERIAL_H */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 9c721cd2c9d6..773d8d8828ad 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -62,8 +62,9 @@
 /* NEC v850.  */
 #define PORT_V850E_UART	40
 
-/* DZ */
-#define PORT_DZ		47
+/* DEC */
+#define PORT_DZ		46
+#define PORT_ZS		47
 
 /* Parisc type numbers. */
 #define PORT_MUX	48
-- 
cgit v1.3-8-gc7d7


From 1e66df3ee301209f4a38df097d7cc5cb9b367a3f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:02 -0700
Subject: add kstrndup

Add a kstrndup function, modelled on strndup.  Like strndup this
returns a string copied into its own allocated memory, but it copies
no more than the specified number of bytes from the source.

Remove private strndup() from irda code.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@mandriva.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Panagiotis Issaris <takis@issaris.org>
Cc: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
 include/linux/string.h  |  1 +
 mm/util.c               | 26 ++++++++++++++++++++++++--
 net/irda/irias_object.c | 43 +++++--------------------------------------
 3 files changed, 30 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index 7f2eb6a477f9..ee5e9ccc4aae 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -105,6 +105,7 @@ extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 
 extern char *kstrdup(const char *s, gfp_t gfp);
+extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
 
 #ifdef __cplusplus
diff --git a/mm/util.c b/mm/util.c
index 78f3783bdcc8..bf340d806868 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,7 +6,6 @@
 
 /**
  * kstrdup - allocate space for and copy an existing string
- *
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  */
@@ -26,6 +25,30 @@ char *kstrdup(const char *s, gfp_t gfp)
 }
 EXPORT_SYMBOL(kstrdup);
 
+/**
+ * kstrndup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @max: read at most @max chars from @s
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kstrndup(const char *s, size_t max, gfp_t gfp)
+{
+	size_t len;
+	char *buf;
+
+	if (!s)
+		return NULL;
+
+	len = strnlen(s, max);
+	buf = kmalloc_track_caller(len+1, gfp);
+	if (buf) {
+		memcpy(buf, s, len);
+		buf[len] = '\0';
+	}
+	return buf;
+}
+EXPORT_SYMBOL(kstrndup);
+
 /**
  * kmemdup - duplicate region of memory
  *
@@ -80,7 +103,6 @@ EXPORT_SYMBOL(krealloc);
 
 /*
  * strndup_user - duplicate an existing string from user space
- *
  * @s: The string to duplicate
  * @n: Maximum number of bytes to copy, including the trailing NUL.
  */
diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c
index 4adaae242b9e..cf302457097b 100644
--- a/net/irda/irias_object.c
+++ b/net/irda/irias_object.c
@@ -36,39 +36,6 @@ hashbin_t *irias_objects;
  */
 struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}};
 
-/*
- * Function strndup (str, max)
- *
- *    My own kernel version of strndup!
- *
- * Faster, check boundary... Jean II
- */
-static char *strndup(char *str, size_t max)
-{
-	char *new_str;
-	int len;
-
-	/* Check string */
-	if (str == NULL)
-		return NULL;
-	/* Check length, truncate */
-	len = strlen(str);
-	if(len > max)
-		len = max;
-
-	/* Allocate new string */
-	new_str = kmalloc(len + 1, GFP_ATOMIC);
-	if (new_str == NULL) {
-		IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__);
-		return NULL;
-	}
-
-	/* Copy and truncate */
-	memcpy(new_str, str, len);
-	new_str[len] = '\0';
-
-	return new_str;
-}
 
 /*
  * Function ias_new_object (name, id)
@@ -90,7 +57,7 @@ struct ias_object *irias_new_object( char *name, int id)
 	}
 
 	obj->magic = IAS_OBJECT_MAGIC;
-	obj->name = strndup(name, IAS_MAX_CLASSNAME);
+	obj->name = kstrndup(name, IAS_MAX_CLASSNAME, GFP_ATOMIC);
 	if (!obj->name) {
 		IRDA_WARNING("%s(), Unable to allocate name!\n",
 			     __FUNCTION__);
@@ -360,7 +327,7 @@ void irias_add_integer_attrib(struct ias_object *obj, char *name, int value,
 	}
 
 	attrib->magic = IAS_ATTRIB_MAGIC;
-	attrib->name = strndup(name, IAS_MAX_ATTRIBNAME);
+	attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
 
 	/* Insert value */
 	attrib->value = irias_new_integer_value(value);
@@ -404,7 +371,7 @@ void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets,
 	}
 
 	attrib->magic = IAS_ATTRIB_MAGIC;
-	attrib->name = strndup(name, IAS_MAX_ATTRIBNAME);
+	attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
 
 	attrib->value = irias_new_octseq_value( octets, len);
 	if (!attrib->name || !attrib->value) {
@@ -446,7 +413,7 @@ void irias_add_string_attrib(struct ias_object *obj, char *name, char *value,
 	}
 
 	attrib->magic = IAS_ATTRIB_MAGIC;
-	attrib->name = strndup(name, IAS_MAX_ATTRIBNAME);
+	attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
 
 	attrib->value = irias_new_string_value(value);
 	if (!attrib->name || !attrib->value) {
@@ -506,7 +473,7 @@ struct ias_value *irias_new_string_value(char *string)
 
 	value->type = IAS_STRING;
 	value->charset = CS_ASCII;
-	value->t.string = strndup(string, IAS_MAX_STRING);
+	value->t.string = kstrndup(string, IAS_MAX_STRING, GFP_ATOMIC);
 	if (!value->t.string) {
 		IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__);
 		kfree(value);
-- 
cgit v1.3-8-gc7d7


From d84d1cc7647c7e4f77d517e2d87b4a106a0420d9 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:02 -0700
Subject: add argv_split()

argv_split() is a helper function which takes a string, splits it at
whitespace, and returns a NULL-terminated argv vector.  This is
deliberately simple - it does no quote processing of any kind.

[ Seems to me that this is something which is already being done in
  the kernel, but I couldn't find any other implementations, either to
  steal or replace.  Keep an eye out. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
---
 include/linux/string.h |   3 ++
 lib/Makefile           |   2 +-
 lib/argv_split.c       | 105 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 lib/argv_split.c

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index ee5e9ccc4aae..836062b7582a 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -108,6 +108,9 @@ extern char *kstrdup(const char *s, gfp_t gfp);
 extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
 
+extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
+extern void argv_free(char **argv);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/Makefile b/lib/Makefile
index da68b2ca0606..614966387402 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,7 +5,7 @@
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o irq_regs.o reciprocal_div.o
+	 sha1.o irq_regs.o reciprocal_div.o argv_split.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/argv_split.c b/lib/argv_split.c
new file mode 100644
index 000000000000..4096ed42f490
--- /dev/null
+++ b/lib/argv_split.c
@@ -0,0 +1,105 @@
+/*
+ * Helper function for splitting a string into an argv-like array.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/bug.h>
+
+static const char *skip_sep(const char *cp)
+{
+	while (*cp && isspace(*cp))
+		cp++;
+
+	return cp;
+}
+
+static const char *skip_arg(const char *cp)
+{
+	while (*cp && !isspace(*cp))
+		cp++;
+
+	return cp;
+}
+
+static int count_argc(const char *str)
+{
+	int count = 0;
+
+	while (*str) {
+		str = skip_sep(str);
+		if (*str) {
+			count++;
+			str = skip_arg(str);
+		}
+	}
+
+	return count;
+}
+
+/**
+ * argv_free - free an argv
+ * @argv - the argument vector to be freed
+ *
+ * Frees an argv and the strings it points to.
+ */
+void argv_free(char **argv)
+{
+	char **p;
+	for (p = argv; *p; p++)
+		kfree(*p);
+
+	kfree(argv);
+}
+EXPORT_SYMBOL(argv_free);
+
+/**
+ * argv_split - split a string at whitespace, returning an argv
+ * @gfp: the GFP mask used to allocate memory
+ * @str: the string to be split
+ * @argcp: returned argument count
+ *
+ * Returns an array of pointers to strings which are split out from
+ * @str.  This is performed by strictly splitting on white-space; no
+ * quote processing is performed.  Multiple whitespace characters are
+ * considered to be a single argument separator.  The returned array
+ * is always NULL-terminated.  Returns NULL on memory allocation
+ * failure.
+ */
+char **argv_split(gfp_t gfp, const char *str, int *argcp)
+{
+	int argc = count_argc(str);
+	char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp);
+	char **argvp;
+
+	if (argv == NULL)
+		goto out;
+
+	*argcp = argc;
+	argvp = argv;
+
+	while (*str) {
+		str = skip_sep(str);
+
+		if (*str) {
+			const char *p = str;
+			char *t;
+
+			str = skip_arg(str);
+
+			t = kstrndup(p, str-p, gfp);
+			if (t == NULL)
+				goto fail;
+			*argvp++ = t;
+		}
+	}
+	*argvp = NULL;
+
+  out:
+	return argv;
+
+  fail:
+	argv_free(argv);
+	return NULL;
+}
+EXPORT_SYMBOL(argv_split);
-- 
cgit v1.3-8-gc7d7


From 0ab4dc92278a0f3816e486d6350c6652a72e06c8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:02 -0700
Subject: usermodehelper: split setup from execution

Rather than having hundreds of variations of call_usermodehelper for
various pieces of usermode state which could be set up, split the
info allocation and initialization from the actual process execution.

This means the general pattern becomes:
 info = call_usermodehelper_setup(path, argv, envp); /* basic state */
 call_usermodehelper_<SET EXTRA STATE>(info, stuff...);	/* extra state */
 call_usermodehelper_exec(info, wait);	/* run process and free info */

This patch introduces wrappers for all the existing calling styles for
call_usermodehelper_*, but folds their implementations into one.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: David Howells <dhowells@redhat.com>
Cc: Bj?rn Steinbrink <B.Steinbrink@gmx.de>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
---
 include/linux/kmod.h |  44 +++++++++++-
 kernel/kmod.c        | 191 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 176 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 10f505c8431d..c4cbe59d9c67 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -36,13 +36,51 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; }
 #define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
 
 struct key;
-extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[],
-				    struct key *session_keyring, int wait);
+struct file;
+struct subprocess_info;
+
+/* Allocate a subprocess_info structure */
+struct subprocess_info *call_usermodehelper_setup(char *path,
+						  char **argv, char **envp);
+
+/* Set various pieces of state into the subprocess_info structure */
+void call_usermodehelper_setkeys(struct subprocess_info *info,
+				 struct key *session_keyring);
+int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
+				  struct file **filp);
+void call_usermodehelper_setcleanup(struct subprocess_info *info,
+				    void (*cleanup)(char **argv, char **envp));
+
+/* Actually execute the sub-process */
+int call_usermodehelper_exec(struct subprocess_info *info, int wait);
+
+/* Free the subprocess_info. This is only needed if you're not going
+   to call call_usermodehelper_exec */
+void call_usermodehelper_freeinfo(struct subprocess_info *info);
 
 static inline int
 call_usermodehelper(char *path, char **argv, char **envp, int wait)
 {
-	return call_usermodehelper_keys(path, argv, envp, NULL, wait);
+	struct subprocess_info *info;
+
+	info = call_usermodehelper_setup(path, argv, envp);
+	if (info == NULL)
+		return -ENOMEM;
+	return call_usermodehelper_exec(info, wait);
+}
+
+static inline int
+call_usermodehelper_keys(char *path, char **argv, char **envp,
+			 struct key *session_keyring, int wait)
+{
+	struct subprocess_info *info;
+
+	info = call_usermodehelper_setup(path, argv, envp);
+	if (info == NULL)
+		return -ENOMEM;
+
+	call_usermodehelper_setkeys(info, session_keyring);
+	return call_usermodehelper_exec(info, wait);
 }
 
 extern void usermodehelper_init(void);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb077179..d2dce71115d8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -122,6 +122,7 @@ struct subprocess_info {
 	int wait;
 	int retval;
 	struct file *stdin;
+	void (*cleanup)(char **argv, char **envp);
 };
 
 /*
@@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data)
 	do_exit(0);
 }
 
+void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+	if (info->cleanup)
+		(*info->cleanup)(info->argv, info->envp);
+	kfree(info);
+}
+EXPORT_SYMBOL(call_usermodehelper_freeinfo);
+
 /* Keventd can't block, but this (a child) can. */
 static int wait_for_helper(void *data)
 {
@@ -217,7 +226,7 @@ static int wait_for_helper(void *data)
 	}
 
 	if (sub_info->wait < 0)
-		kfree(sub_info);
+		call_usermodehelper_freeinfo(sub_info);
 	else
 		complete(sub_info->complete);
 	return 0;
@@ -252,11 +261,94 @@ static void __call_usermodehelper(struct work_struct *work)
 }
 
 /**
- * call_usermodehelper_keys - start a usermode application
- * @path: pathname for the application
- * @argv: null-terminated argument list
- * @envp: null-terminated environment list
- * @session_keyring: session keyring for process (NULL for an empty keyring)
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path - path to usermode executable
+ * @argv - arg vector for process
+ * @envp - environment for process
+ *
+ * Returns either NULL on allocation failure, or a subprocess_info
+ * structure.  This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ */
+struct subprocess_info *call_usermodehelper_setup(char *path,
+						  char **argv, char **envp)
+{
+	struct subprocess_info *sub_info;
+	sub_info = kzalloc(sizeof(struct subprocess_info),  GFP_ATOMIC);
+	if (!sub_info)
+		goto out;
+
+	INIT_WORK(&sub_info->work, __call_usermodehelper);
+	sub_info->path = path;
+	sub_info->argv = argv;
+	sub_info->envp = envp;
+
+  out:
+	return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_setkeys - set the session keys for usermode helper
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @session_keyring: the session keyring for the process
+ */
+void call_usermodehelper_setkeys(struct subprocess_info *info,
+				 struct key *session_keyring)
+{
+	info->ring = session_keyring;
+}
+EXPORT_SYMBOL(call_usermodehelper_setkeys);
+
+/**
+ * call_usermodehelper_setcleanup - set a cleanup function
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @cleanup: a cleanup function
+ *
+ * The cleanup function is just befor ethe subprocess_info is about to
+ * be freed.  This can be used for freeing the argv and envp.  The
+ * Function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+void call_usermodehelper_setcleanup(struct subprocess_info *info,
+				    void (*cleanup)(char **argv, char **envp))
+{
+	info->cleanup = cleanup;
+}
+EXPORT_SYMBOL(call_usermodehelper_setcleanup);
+
+/**
+ * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
+ * @sub_info: a subprocess_info returned by call_usermodehelper_setup
+ * @filp: set to the write-end of a pipe
+ *
+ * This constructs a pipe, and sets the read end to be the stdin of the
+ * subprocess, and returns the write-end in *@filp.
+ */
+int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
+				  struct file **filp)
+{
+	struct file *f;
+
+	f = create_write_pipe();
+	if (IS_ERR(f))
+		return PTR_ERR(f);
+	*filp = f;
+
+	f = create_read_pipe(f);
+	if (IS_ERR(f)) {
+		free_write_pipe(*filp);
+		return PTR_ERR(f);
+	}
+	sub_info->stdin = f;
+
+	return 0;
+}
+EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocessa
  * @wait: wait for the application to finish and return status.
  *        when -1 don't wait at all, but you get no useful error back when
  *        the program couldn't be exec'ed. This makes it safe to call
@@ -265,33 +357,24 @@ static void __call_usermodehelper(struct work_struct *work)
  * Runs a user-space application.  The application is started
  * asynchronously if wait is not set, and runs as a child of keventd.
  * (ie. it runs with full root capabilities).
- *
- * Must be called from process context.  Returns a negative error code
- * if program was not execed successfully, or 0.
  */
-int call_usermodehelper_keys(char *path, char **argv, char **envp,
-			     struct key *session_keyring, int wait)
+int call_usermodehelper_exec(struct subprocess_info *sub_info,
+			     int wait)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
-	struct subprocess_info *sub_info;
 	int retval;
 
-	if (!khelper_wq)
-		return -EBUSY;
-
-	if (path[0] == '\0')
-		return 0;
+	if (sub_info->path[0] == '\0') {
+		retval = 0;
+		goto out;
+	}
 
-	sub_info = kzalloc(sizeof(struct subprocess_info),  GFP_ATOMIC);
-	if (!sub_info)
-		return -ENOMEM;
+	if (!khelper_wq) {
+		retval = -EBUSY;
+		goto out;
+	}
 
-	INIT_WORK(&sub_info->work, __call_usermodehelper);
 	sub_info->complete = &done;
-	sub_info->path = path;
-	sub_info->argv = argv;
-	sub_info->envp = envp;
-	sub_info->ring = session_keyring;
 	sub_info->wait = wait;
 
 	queue_work(khelper_wq, &sub_info->work);
@@ -299,47 +382,43 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 		return 0;
 	wait_for_completion(&done);
 	retval = sub_info->retval;
-	kfree(sub_info);
+
+  out:
+	call_usermodehelper_freeinfo(sub_info);
 	return retval;
 }
-EXPORT_SYMBOL(call_usermodehelper_keys);
+EXPORT_SYMBOL(call_usermodehelper_exec);
 
+/**
+ * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @filp: set to the write-end of a pipe
+ *
+ * This is a simple wrapper which executes a usermode-helper function
+ * with a pipe as stdin.  It is implemented entirely in terms of
+ * lower-level call_usermodehelper_* functions.
+ */
 int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 			     struct file **filp)
 {
-	DECLARE_COMPLETION(done);
-	struct subprocess_info sub_info = {
-		.work		= __WORK_INITIALIZER(sub_info.work,
-						     __call_usermodehelper),
-		.complete	= &done,
-		.path		= path,
-		.argv		= argv,
-		.envp		= envp,
-		.retval		= 0,
-	};
-	struct file *f;
-
-	if (!khelper_wq)
-		return -EBUSY;
+	struct subprocess_info *sub_info;
+	int ret;
 
-	if (path[0] == '\0')
-		return 0;
+	sub_info = call_usermodehelper_setup(path, argv, envp);
+	if (sub_info == NULL)
+		return -ENOMEM;
 
-	f = create_write_pipe();
-	if (IS_ERR(f))
-		return PTR_ERR(f);
-	*filp = f;
+	ret = call_usermodehelper_stdinpipe(sub_info, filp);
+	if (ret < 0)
+		goto out;
 
-	f = create_read_pipe(f);
-	if (IS_ERR(f)) {
-		free_write_pipe(*filp);
-		return PTR_ERR(f);
-	}
-	sub_info.stdin = f;
+	return call_usermodehelper_exec(sub_info, 1);
 
-	queue_work(khelper_wq, &sub_info.work);
-	wait_for_completion(&done);
-	return sub_info.retval;
+  out:
+	call_usermodehelper_freeinfo(sub_info);
+	return ret;
 }
 EXPORT_SYMBOL(call_usermodehelper_pipe);
 
-- 
cgit v1.3-8-gc7d7


From 10a0a8d4e3f6bf2d077f94344441909abe670f5a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:02 -0700
Subject: Add common orderly_poweroff()

Various pieces of code around the kernel want to be able to trigger an
orderly poweroff.  This pulls them together into a single
implementation.

By default the poweroff command is /sbin/poweroff, but it can be set
via sysctl: kernel/poweroff_cmd.  This is split at whitespace, so it
can include command-line arguments.

This patch replaces four other instances of invoking either "poweroff"
or "shutdown -h now": two sbus drivers, and acpi thermal
management.

sparc64 has its own "powerd"; still need to determine whether it should
be replaced by orderly_poweroff().

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Len Brown <lenb@kernel.org>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/thermal.c          | 24 ++---------------
 drivers/sbus/char/bbc_envctrl.c |  5 ++--
 drivers/sbus/char/envctrl.c     |  7 ++---
 include/linux/reboot.h          |  5 ++++
 kernel/sys.c                    | 58 +++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c                 | 10 +++++++
 6 files changed, 79 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 88a6fc7fd271..58f1338981bc 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -40,6 +40,7 @@
 #include <linux/jiffies.h>
 #include <linux/kmod.h>
 #include <linux/seq_file.h>
+#include <linux/reboot.h>
 #include <asm/uaccess.h>
 
 #include <acpi/acpi_bus.h>
@@ -59,7 +60,6 @@
 #define ACPI_THERMAL_NOTIFY_CRITICAL	0xF0
 #define ACPI_THERMAL_NOTIFY_HOT		0xF1
 #define ACPI_THERMAL_MODE_ACTIVE	0x00
-#define ACPI_THERMAL_PATH_POWEROFF	"/sbin/poweroff"
 
 #define ACPI_THERMAL_MAX_ACTIVE	10
 #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65
@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz)
 	return 0;
 }
 
-static int acpi_thermal_call_usermode(char *path)
-{
-	char *argv[2] = { NULL, NULL };
-	char *envp[3] = { NULL, NULL, NULL };
-
-
-	if (!path)
-		return -EINVAL;
-
-	argv[0] = path;
-
-	/* minimal command environment */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-
-	call_usermodehelper(argv[0], argv, envp, 0);
-
-	return 0;
-}
-
 static int acpi_thermal_critical(struct acpi_thermal *tz)
 {
 	if (!tz || !tz->trips.critical.flags.valid)
@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz)
 	acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL,
 				tz->trips.critical.flags.enabled);
 
-	acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF);
+	orderly_poweroff(true);
 
 	return 0;
 }
diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c
index a54e4140683a..e821a155b658 100644
--- a/drivers/sbus/char/bbc_envctrl.c
+++ b/drivers/sbus/char/bbc_envctrl.c
@@ -7,6 +7,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/kmod.h>
+#include <linux/reboot.h>
 #include <asm/oplib.h>
 #include <asm/ebus.h>
 
@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp)
 static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
 {
 	static int shutting_down = 0;
-	static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
-	char *argv[] = { "/sbin/shutdown", "-h", "now", NULL };
 	char *type = "???";
 	s8 val = -1;
 
@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
 	printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n");
 
 	shutting_down = 1;
-	if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0)
+	if (orderly_poweroff(true) < 0)
 		printk(KERN_CRIT "envctrl: shutdown execution failed\n");
 }
 
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c
index 8328acab47fd..dadabef116b6 100644
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -26,6 +26,7 @@
 #include <linux/ioport.h>
 #include <linux/miscdevice.h>
 #include <linux/kmod.h>
+#include <linux/reboot.h>
 
 #include <asm/ebus.h>
 #include <asm/uaccess.h>
@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type)
 static void envctrl_do_shutdown(void)
 {
 	static int inprog = 0;
-	static char *envp[] = {	
-		"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
-	char *argv[] = { 
-		"/sbin/shutdown", "-h", "now", NULL };	
 	int ret;
 
 	if (inprog != 0)
@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void)
 
 	inprog = 1;
 	printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n");
-	ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0);
+	ret = orderly_poweroff(true);
 	if (ret < 0) {
 		printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); 
 		inprog = 0;  /* unlikely to succeed, but we could try again */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 1dd1c707311f..85ea63f462af 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -67,6 +67,11 @@ extern void kernel_power_off(void);
 
 void ctrl_alt_del(void);
 
+#define POWEROFF_CMD_PATH_LEN	256
+extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
+
+extern int orderly_poweroff(bool force);
+
 /*
  * Emergency restart, callable from an interrupt handler.
  */
diff --git a/kernel/sys.c b/kernel/sys.c
index 4d141ae3e802..aeded9ad66ce 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
 	}
 	return err ? -EFAULT : 0;
 }
+
+char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+
+static void argv_cleanup(char **argv, char **envp)
+{
+	argv_free(argv);
+}
+
+/**
+ * orderly_poweroff - Trigger an orderly system poweroff
+ * @force: force poweroff if command execution fails
+ *
+ * This may be called from any context to trigger a system shutdown.
+ * If the orderly shutdown fails, it will force an immediate shutdown.
+ */
+int orderly_poweroff(bool force)
+{
+	int argc;
+	char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
+	static char *envp[] = {
+		"HOME=/",
+		"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+		NULL
+	};
+	int ret = -ENOMEM;
+	struct subprocess_info *info;
+
+	if (argv == NULL) {
+		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
+		       __func__, poweroff_cmd);
+		goto out;
+	}
+
+	info = call_usermodehelper_setup(argv[0], argv, envp);
+	if (info == NULL) {
+		argv_free(argv);
+		goto out;
+	}
+
+	call_usermodehelper_setcleanup(info, argv_cleanup);
+
+	ret = call_usermodehelper_exec(info, -1);
+
+  out:
+	if (ret && force) {
+		printk(KERN_WARNING "Failed to start orderly shutdown: "
+		       "forcing the issue\n");
+
+		/* I guess this should try to kick off some daemon to
+		   sync and poweroff asap.  Or not even bother syncing
+		   if we're doing an emergency shutdown? */
+		emergency_sync();
+		kernel_power_off();
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(orderly_poweroff);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7063ebc6db05..44a1d699aad7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -46,6 +46,7 @@
 #include <linux/syscalls.h>
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
+#include <linux/reboot.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -705,6 +706,15 @@ static ctl_table kern_table[] = {
 		.proc_handler   = &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "poweroff_cmd",
+		.data		= &poweroff_cmd,
+		.maxlen		= POWEROFF_CMD_PATH_LEN,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
 
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.3-8-gc7d7


From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: usermodehelper: Tidy up waiting

Rather than using a tri-state integer for the wait flag in
call_usermodehelper_exec, define a proper enum, and use that.  I've
preserved the integer values so that any callers I've missed should
still work OK.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: David Howells <dhowells@redhat.com>
---
 arch/i386/mach-voyager/voyager_thread.c |  2 +-
 arch/x86_64/kernel/mce.c                |  2 +-
 drivers/macintosh/therm_pm72.c          |  3 ++-
 drivers/macintosh/windfarm_core.c       |  3 ++-
 drivers/net/hamradio/baycom_epp.c       |  2 +-
 drivers/pnp/pnpbios/core.c              |  2 +-
 fs/ocfs2/heartbeat.c                    |  2 +-
 include/linux/kmod.h                    | 12 +++++++++---
 kernel/cpuset.c                         |  2 +-
 kernel/kmod.c                           | 27 ++++++++++++++++-----------
 kernel/sys.c                            |  2 +-
 lib/kobject_uevent.c                    |  2 +-
 net/bridge/br_stp_if.c                  |  2 +-
 security/keys/request_key.c             |  3 ++-
 14 files changed, 40 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index b4b24e0e45e1..f9d595338159 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
 		NULL,
 	};
 
-	if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
+	if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
 		printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
 		       string, ret);
 	}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index aa1d15991794..f3fb8174559e 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
 	if (events != atomic_read(&mce_logged) && trigger[0]) {
 		/* Small race window, but should be harmless.  */
 		atomic_set(&mce_logged, events);
-		call_usermodehelper(trigger, trigger_argv, NULL, -1);
+		call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 	}
 }
 
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index dbb22403979f..3d90fc002097 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void)
 				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 				NULL };
 
-	return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
+	return call_usermodehelper(critical_overtemp_path,
+				   argv, envp, UMH_WAIT_EXEC);
 }
 
 
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index e18d265d5d33..516d943227e2 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -80,7 +80,8 @@ int wf_critical_overtemp(void)
 				"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 				NULL };
 
-	return call_usermodehelper(critical_overtemp_path, argv, envp, 0);
+	return call_usermodehelper(critical_overtemp_path,
+				   argv, envp, UMH_WAIT_EXEC);
 }
 EXPORT_SYMBOL_GPL(wf_critical_overtemp);
 
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 84aa2117c0ee..355c6cf3d112 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc)
 	sprintf(portarg, "%ld", bc->pdev->port->base);
 	printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg);
 
-	return call_usermodehelper(eppconfig_path, argv, envp, 1);
+	return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index 03baf1c64a2e..ed112ee16012 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info)
 		info->location_id, info->serial, info->capabilities);
 	envp[i] = NULL;
 	
-	value = call_usermodehelper (argv [0], argv, envp, 0);
+	value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC);
 	kfree (buf);
 	kfree (envp);
 	return 0;
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 352eb4a13f98..c4c36171240d 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[2] = NULL;
 
-	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	if (ret < 0)
 		mlog_errno(ret);
 }
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index c4cbe59d9c67..5dc13848891b 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -51,15 +51,21 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
 void call_usermodehelper_setcleanup(struct subprocess_info *info,
 				    void (*cleanup)(char **argv, char **envp));
 
+enum umh_wait {
+	UMH_NO_WAIT = -1,	/* don't wait at all */
+	UMH_WAIT_EXEC = 0,	/* wait for the exec, but not the process */
+	UMH_WAIT_PROC = 1,	/* wait for the process to complete */
+};
+
 /* Actually execute the sub-process */
-int call_usermodehelper_exec(struct subprocess_info *info, int wait);
+int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait);
 
 /* Free the subprocess_info. This is only needed if you're not going
    to call call_usermodehelper_exec */
 void call_usermodehelper_freeinfo(struct subprocess_info *info);
 
 static inline int
-call_usermodehelper(char *path, char **argv, char **envp, int wait)
+call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
 {
 	struct subprocess_info *info;
 
@@ -71,7 +77,7 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait)
 
 static inline int
 call_usermodehelper_keys(char *path, char **argv, char **envp,
-			 struct key *session_keyring, int wait)
+			 struct key *session_keyring, enum umh_wait wait)
 {
 	struct subprocess_info *info;
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b4796d850140..57e6448b171e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
 	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[i] = NULL;
 
-	call_usermodehelper(argv[0], argv, envp, 0);
+	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
 	kfree(pathbuf);
 }
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index d2dce71115d8..78d365c524ed 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -119,7 +119,7 @@ struct subprocess_info {
 	char **argv;
 	char **envp;
 	struct key *ring;
-	int wait;
+	enum umh_wait wait;
 	int retval;
 	struct file *stdin;
 	void (*cleanup)(char **argv, char **envp);
@@ -225,7 +225,7 @@ static int wait_for_helper(void *data)
 			sub_info->retval = ret;
 	}
 
-	if (sub_info->wait < 0)
+	if (sub_info->wait == UMH_NO_WAIT)
 		call_usermodehelper_freeinfo(sub_info);
 	else
 		complete(sub_info->complete);
@@ -238,26 +238,31 @@ static void __call_usermodehelper(struct work_struct *work)
 	struct subprocess_info *sub_info =
 		container_of(work, struct subprocess_info, work);
 	pid_t pid;
-	int wait = sub_info->wait;
+	enum umh_wait wait = sub_info->wait;
 
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done.  */
-	if (wait)
+	if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
 		pid = kernel_thread(wait_for_helper, sub_info,
 				    CLONE_FS | CLONE_FILES | SIGCHLD);
 	else
 		pid = kernel_thread(____call_usermodehelper, sub_info,
 				    CLONE_VFORK | SIGCHLD);
 
-	if (wait < 0)
-		return;
+	switch (wait) {
+	case UMH_NO_WAIT:
+		break;
 
-	if (pid < 0) {
+	case UMH_WAIT_PROC:
+		if (pid > 0)
+			break;
 		sub_info->retval = pid;
+		/* FALLTHROUGH */
+
+	case UMH_WAIT_EXEC:
 		complete(sub_info->complete);
-	} else if (!wait)
-		complete(sub_info->complete);
+	}
 }
 
 /**
@@ -359,7 +364,7 @@ EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
  * (ie. it runs with full root capabilities).
  */
 int call_usermodehelper_exec(struct subprocess_info *sub_info,
-			     int wait)
+			     enum umh_wait wait)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval;
@@ -378,7 +383,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	sub_info->wait = wait;
 
 	queue_work(khelper_wq, &sub_info->work);
-	if (wait < 0) /* task has freed sub_info */
+	if (wait == UMH_NO_WAIT) /* task has freed sub_info */
 		return 0;
 	wait_for_completion(&done);
 	retval = sub_info->retval;
diff --git a/kernel/sys.c b/kernel/sys.c
index aeded9ad66ce..18987c7f6add 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2327,7 +2327,7 @@ int orderly_poweroff(bool force)
 
 	call_usermodehelper_setcleanup(info, argv_cleanup);
 
-	ret = call_usermodehelper_exec(info, -1);
+	ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
 
   out:
 	if (ret && force) {
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 12e311dc664c..bd5ecbbafab1 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -208,7 +208,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 		argv [0] = uevent_helper;
 		argv [1] = (char *)subsystem;
 		argv [2] = NULL;
-		call_usermodehelper (argv[0], argv, envp, 0);
+		call_usermodehelper (argv[0], argv, envp, UMH_WAIT_EXEC);
 	}
 
 exit:
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index a786e7863200..1ea2f86f7683 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -125,7 +125,7 @@ static void br_stp_start(struct net_bridge *br)
 	char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
 	char *envp[] = { NULL };
 
-	r = call_usermodehelper(BR_STP_PROG, argv, envp, 1);
+	r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
 	if (r == 0) {
 		br->stp_enabled = BR_USER_STP;
 		printk(KERN_INFO "%s: userspace STP started\n", br->dev->name);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index f573ac189a0a..557500110a13 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -108,7 +108,8 @@ static int call_sbin_request_key(struct key *key,
 	argv[i] = NULL;
 
 	/* do it */
-	ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1);
+	ret = call_usermodehelper_keys(argv[0], argv, envp, keyring,
+				       UMH_WAIT_PROC);
 
 error_link:
 	key_put(keyring);
-- 
cgit v1.3-8-gc7d7


From 810bab448e563ffd1718d78e9a3756806b626acc Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:03 -0700
Subject: use elfnote.h to generate vsyscall notes.

Use existing elfnote.h to generate vsyscall notes, rather than doing
it locally.  Changes elfnote.h a bit to suit, since this is the first
asm user, and it wasn't quite right.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.com>
---
 arch/i386/kernel/vsyscall-note.S | 23 ++++++-----------------
 include/linux/elfnote.h          | 22 +++++++++++++++-------
 2 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index d4b5be4f3d5f..52e0cbbac70c 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,12 @@
  * Here we can supply some information useful to userland.
  */
 
-#include <linux/uts.h>
 #include <linux/version.h>
+#include <linux/elfnote.h>
 
-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type)			      \
-	.section name, flags;						      \
-	.balign 4;							      \
-	.long 1f - 0f;		/* name length */			      \
-	.long 3f - 2f;		/* data length */			      \
-	.long type;		/* note type */				      \
-0:	.asciz vendor;		/* vendor name */			      \
-1:	.balign 4;							      \
-2:
-
-#define ASM_ELF_NOTE_END						      \
-3:	.balign 4;		/* pad out section */			      \
-	.previous
-
-	ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
+/* Ideally this would use UTS_NAME, but using a quoted string here
+   doesn't work. Remember to change this when changing the
+   kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
 	.long LINUX_VERSION_CODE
-	ASM_ELF_NOTE_END
+ELFNOTE_END
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 9a1e0674e56c..e831759b2fb5 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -38,17 +38,25 @@
  * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
  *      ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
  */
-#define ELFNOTE(name, type, desctype, descdata)	\
-.pushsection .note.name, "",@note	;	\
-  .align 4				;	\
+#define ELFNOTE_START(name, type, flags)	\
+.pushsection .note.name, flags,@note	;	\
+  .balign 4				;	\
   .long 2f - 1f		/* namesz */	;	\
-  .long 4f - 3f		/* descsz */	;	\
+  .long 4484f - 3f	/* descsz */	;	\
   .long type				;	\
 1:.asciz #name				;	\
-2:.align 4				;	\
-3:desctype descdata			;	\
-4:.align 4				;	\
+2:.balign 4				;	\
+3:
+
+#define ELFNOTE_END				\
+4484:.balign 4				;	\
 .popsection				;
+
+#define ELFNOTE(name, type, desc)		\
+	ELFNOTE_START(name, type, "")		\
+		desc			;	\
+	ELFNOTE_END
+
 #else	/* !__ASSEMBLER__ */
 #include <linux/elf.h>
 /*
-- 
cgit v1.3-8-gc7d7


From 5f4352fbffd6c45123dbce9e195efd54df4e177e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:04 -0700
Subject: Allocate and free vmalloc areas

Allocate/release a chunk of vmalloc address space:
 alloc_vm_area reserves a chunk of address space, and makes sure all
 the pagetables are constructed for that address range - but no pages.

 free_vm_area releases the address space range.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: "Jan Beulich" <JBeulich@novell.com>
Cc: "Andi Kleen" <ak@muc.de>
---
 include/linux/vmalloc.h |  4 ++++
 mm/vmalloc.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 132b260aef1e..c2b10cae5da5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
 			struct page ***pages);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 
+/* Allocate/destroy a 'vmalloc' VM area. */
+extern struct vm_struct *alloc_vm_area(size_t size);
+extern void free_vm_area(struct vm_struct *area);
+
 /*
  *	Internals.  Dont't use..
  */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e05a11155c9..3130c343088f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -767,3 +767,56 @@ EXPORT_SYMBOL(remap_vmalloc_range);
 void  __attribute__((weak)) vmalloc_sync_all(void)
 {
 }
+
+
+static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+	/* apply_to_page_range() does all the hard work. */
+	return 0;
+}
+
+/**
+ *	alloc_vm_area - allocate a range of kernel address space
+ *	@size:		size of the area
+ *	@returns:	NULL on failure, vm_struct on success
+ *
+ *	This function reserves a range of kernel address space, and
+ *	allocates pagetables to map that range.  No actual mappings
+ *	are created.  If the kernel address space is not shared
+ *	between processes, it syncs the pagetable across all
+ *	processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+	struct vm_struct *area;
+
+	area = get_vm_area(size, VM_IOREMAP);
+	if (area == NULL)
+		return NULL;
+
+	/*
+	 * This ensures that page tables are constructed for this region
+	 * of kernel virtual address space and mapped into init_mm.
+	 */
+	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+				area->size, f, NULL)) {
+		free_vm_area(area);
+		return NULL;
+	}
+
+	/* Make sure the pagetables are constructed in process kernel
+	   mappings */
+	vmalloc_sync_all();
+
+	return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+	struct vm_struct *ret;
+	ret = remove_vm_area(area->addr);
+	BUG_ON(ret != area);
+	kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
-- 
cgit v1.3-8-gc7d7


From c85b04c3749507546f6d5868976e4793e35c2ec0 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:05 -0700
Subject: xen: add pinned page flag

Add a new definition for PG_owner_priv_1 to define PG_pinned on Xen
pagetable pages.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/linux/page-flags.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ae2d79f2107e..731cd2ac3227 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -92,6 +92,7 @@
 
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
+#define PG_pinned		PG_owner_priv_1	/* Xen pinned pagetable */
 
 #if (BITS_PER_LONG > 32)
 /*
@@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page)
 #define SetPageChecked(page)	set_bit(PG_checked, &(page)->flags)
 #define ClearPageChecked(page)	clear_bit(PG_checked, &(page)->flags)
 
+#define PagePinned(page)	test_bit(PG_pinned, &(page)->flags)
+#define SetPagePinned(page)	set_bit(PG_pinned, &(page)->flags)
+#define ClearPagePinned(page)	clear_bit(PG_pinned, &(page)->flags)
+
 #define PageReserved(page)	test_bit(PG_reserved, &(page)->flags)
 #define SetPageReserved(page)	set_bit(PG_reserved, &(page)->flags)
 #define ClearPageReserved(page)	clear_bit(PG_reserved, &(page)->flags)
-- 
cgit v1.3-8-gc7d7


From 9f27ee595038653ddf8bca871200d39247d6f4fc Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: xen: add virtual block device driver.

The block device frontend driver allows the kernel to access block
devices exported exported by a virtual machine containing a physical
block device driver.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Greg KH <greg@kroah.com>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig        |   9 +
 drivers/block/Makefile       |   1 +
 drivers/block/xen-blkfront.c | 988 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/major.h        |   2 +
 4 files changed, 1000 insertions(+)
 create mode 100644 drivers/block/xen-blkfront.c

(limited to 'include/linux')

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8f65b88cf711..a4a311992408 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -427,4 +427,13 @@ config XILINX_SYSACE
 	help
 	  Include support for the Xilinx SystemACE CompactFlash interface
 
+config XEN_BLKDEV_FRONTEND
+	tristate "Xen virtual block device support"
+	depends on XEN
+	default y
+	help
+	  This driver implements the front-end of the Xen virtual
+	  block device driver.  It communicates with a back-end driver
+	  in another domain which drives the actual block device.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 9ee08ab4ffa8..3e31532df0ed 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD)		+= viodasd.o
 obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
 obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
 
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
new file mode 100644
index 000000000000..6746c29181f8
--- /dev/null
+++ b/drivers/block/xen-blkfront.c
@@ -0,0 +1,988 @@
+/*
+ * blkfront.c
+ *
+ * XenLinux virtual block device driver.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ * Copyright (c) 2004, Andrew Warfield
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2005, XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/page.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/blkif.h>
+
+#include <asm/xen/hypervisor.h>
+
+enum blkif_state {
+	BLKIF_STATE_DISCONNECTED,
+	BLKIF_STATE_CONNECTED,
+	BLKIF_STATE_SUSPENDED,
+};
+
+struct blk_shadow {
+	struct blkif_request req;
+	unsigned long request;
+	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+
+static struct block_device_operations xlvbd_block_fops;
+
+#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+
+/*
+ * We have one of these per vbd, whether ide, scsi or 'other'.  They
+ * hang in private_data off the gendisk structure. We may end up
+ * putting all kinds of interesting stuff here :-)
+ */
+struct blkfront_info
+{
+	struct xenbus_device *xbdev;
+	dev_t dev;
+	struct gendisk *gd;
+	int vdevice;
+	blkif_vdev_t handle;
+	enum blkif_state connected;
+	int ring_ref;
+	struct blkif_front_ring ring;
+	unsigned int evtchn, irq;
+	struct request_queue *rq;
+	struct work_struct work;
+	struct gnttab_free_callback callback;
+	struct blk_shadow shadow[BLK_RING_SIZE];
+	unsigned long shadow_free;
+	int feature_barrier;
+
+	/**
+	 * The number of people holding this device open.  We won't allow a
+	 * hot-unplug unless this is 0.
+	 */
+	int users;
+};
+
+static DEFINE_SPINLOCK(blkif_io_lock);
+
+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+#define GRANT_INVALID_REF	0
+
+#define PARTS_PER_DISK		16
+
+#define BLKIF_MAJOR(dev) ((dev)>>8)
+#define BLKIF_MINOR(dev) ((dev) & 0xff)
+
+#define DEV_NAME	"xvd"	/* name in /dev */
+
+/* Information about our VBDs. */
+#define MAX_VBDS 64
+static LIST_HEAD(vbds_list);
+
+static int get_id_from_freelist(struct blkfront_info *info)
+{
+	unsigned long free = info->shadow_free;
+	BUG_ON(free > BLK_RING_SIZE);
+	info->shadow_free = info->shadow[free].req.id;
+	info->shadow[free].req.id = 0x0fffffee; /* debug */
+	return free;
+}
+
+static void add_id_to_freelist(struct blkfront_info *info,
+			       unsigned long id)
+{
+	info->shadow[id].req.id  = info->shadow_free;
+	info->shadow[id].request = 0;
+	info->shadow_free = id;
+}
+
+static void blkif_restart_queue_callback(void *arg)
+{
+	struct blkfront_info *info = (struct blkfront_info *)arg;
+	schedule_work(&info->work);
+}
+
+/*
+ * blkif_queue_request
+ *
+ * request block io
+ *
+ * id: for guest use only.
+ * operation: BLKIF_OP_{READ,WRITE,PROBE}
+ * buffer: buffer to read/write into. this should be a
+ *   virtual address in the guest os.
+ */
+static int blkif_queue_request(struct request *req)
+{
+	struct blkfront_info *info = req->rq_disk->private_data;
+	unsigned long buffer_mfn;
+	struct blkif_request *ring_req;
+	struct bio *bio;
+	struct bio_vec *bvec;
+	int idx;
+	unsigned long id;
+	unsigned int fsect, lsect;
+	int ref;
+	grant_ref_t gref_head;
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+		return 1;
+
+	if (gnttab_alloc_grant_references(
+		BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
+		gnttab_request_free_callback(
+			&info->callback,
+			blkif_restart_queue_callback,
+			info,
+			BLKIF_MAX_SEGMENTS_PER_REQUEST);
+		return 1;
+	}
+
+	/* Fill out a communications ring structure. */
+	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+	id = get_id_from_freelist(info);
+	info->shadow[id].request = (unsigned long)req;
+
+	ring_req->id = id;
+	ring_req->sector_number = (blkif_sector_t)req->sector;
+	ring_req->handle = info->handle;
+
+	ring_req->operation = rq_data_dir(req) ?
+		BLKIF_OP_WRITE : BLKIF_OP_READ;
+	if (blk_barrier_rq(req))
+		ring_req->operation = BLKIF_OP_WRITE_BARRIER;
+
+	ring_req->nr_segments = 0;
+	rq_for_each_bio (bio, req) {
+		bio_for_each_segment (bvec, bio, idx) {
+			BUG_ON(ring_req->nr_segments
+			       == BLKIF_MAX_SEGMENTS_PER_REQUEST);
+			buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
+			fsect = bvec->bv_offset >> 9;
+			lsect = fsect + (bvec->bv_len >> 9) - 1;
+			/* install a grant reference. */
+			ref = gnttab_claim_grant_reference(&gref_head);
+			BUG_ON(ref == -ENOSPC);
+
+			gnttab_grant_foreign_access_ref(
+				ref,
+				info->xbdev->otherend_id,
+				buffer_mfn,
+				rq_data_dir(req) );
+
+			info->shadow[id].frame[ring_req->nr_segments] =
+				mfn_to_pfn(buffer_mfn);
+
+			ring_req->seg[ring_req->nr_segments] =
+				(struct blkif_request_segment) {
+					.gref       = ref,
+					.first_sect = fsect,
+					.last_sect  = lsect };
+
+			ring_req->nr_segments++;
+		}
+	}
+
+	info->ring.req_prod_pvt++;
+
+	/* Keep a private copy so we can reissue requests when recovering. */
+	info->shadow[id].req = *ring_req;
+
+	gnttab_free_grant_references(gref_head);
+
+	return 0;
+}
+
+
+static inline void flush_requests(struct blkfront_info *info)
+{
+	int notify;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+
+	if (notify)
+		notify_remote_via_irq(info->irq);
+}
+
+/*
+ * do_blkif_request
+ *  read a block; request is in a request queue
+ */
+static void do_blkif_request(request_queue_t *rq)
+{
+	struct blkfront_info *info = NULL;
+	struct request *req;
+	int queued;
+
+	pr_debug("Entered do_blkif_request\n");
+
+	queued = 0;
+
+	while ((req = elv_next_request(rq)) != NULL) {
+		info = req->rq_disk->private_data;
+		if (!blk_fs_request(req)) {
+			end_request(req, 0);
+			continue;
+		}
+
+		if (RING_FULL(&info->ring))
+			goto wait;
+
+		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
+			 "(%u/%li) buffer:%p [%s]\n",
+			 req, req->cmd, (unsigned long)req->sector,
+			 req->current_nr_sectors,
+			 req->nr_sectors, req->buffer,
+			 rq_data_dir(req) ? "write" : "read");
+
+
+		blkdev_dequeue_request(req);
+		if (blkif_queue_request(req)) {
+			blk_requeue_request(rq, req);
+wait:
+			/* Avoid pointless unplugs. */
+			blk_stop_queue(rq);
+			break;
+		}
+
+		queued++;
+	}
+
+	if (queued != 0)
+		flush_requests(info);
+}
+
+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+{
+	request_queue_t *rq;
+
+	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+	if (rq == NULL)
+		return -1;
+
+	elevator_init(rq, "noop");
+
+	/* Hard sector size and max sectors impersonate the equiv. hardware. */
+	blk_queue_hardsect_size(rq, sector_size);
+	blk_queue_max_sectors(rq, 512);
+
+	/* Each segment in a request is up to an aligned page in size. */
+	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+	/* Ensure a merged request will fit in a single I/O ring slot. */
+	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+	/* Make sure buffer addresses are sector-aligned. */
+	blk_queue_dma_alignment(rq, 511);
+
+	gd->queue = rq;
+
+	return 0;
+}
+
+
+static int xlvbd_barrier(struct blkfront_info *info)
+{
+	int err;
+
+	err = blk_queue_ordered(info->rq,
+				info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
+				NULL);
+
+	if (err)
+		return err;
+
+	printk(KERN_INFO "blkfront: %s: barriers %s\n",
+	       info->gd->disk_name,
+	       info->feature_barrier ? "enabled" : "disabled");
+	return 0;
+}
+
+
+static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
+			       int vdevice, u16 vdisk_info, u16 sector_size,
+			       struct blkfront_info *info)
+{
+	struct gendisk *gd;
+	int nr_minors = 1;
+	int err = -ENODEV;
+
+	BUG_ON(info->gd != NULL);
+	BUG_ON(info->rq != NULL);
+
+	if ((minor % PARTS_PER_DISK) == 0)
+		nr_minors = PARTS_PER_DISK;
+
+	gd = alloc_disk(nr_minors);
+	if (gd == NULL)
+		goto out;
+
+	if (nr_minors > 1)
+		sprintf(gd->disk_name, "%s%c", DEV_NAME,
+			'a' + minor / PARTS_PER_DISK);
+	else
+		sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
+			'a' + minor / PARTS_PER_DISK,
+			minor % PARTS_PER_DISK);
+
+	gd->major = XENVBD_MAJOR;
+	gd->first_minor = minor;
+	gd->fops = &xlvbd_block_fops;
+	gd->private_data = info;
+	gd->driverfs_dev = &(info->xbdev->dev);
+	set_capacity(gd, capacity);
+
+	if (xlvbd_init_blk_queue(gd, sector_size)) {
+		del_gendisk(gd);
+		goto out;
+	}
+
+	info->rq = gd->queue;
+	info->gd = gd;
+
+	if (info->feature_barrier)
+		xlvbd_barrier(info);
+
+	if (vdisk_info & VDISK_READONLY)
+		set_disk_ro(gd, 1);
+
+	if (vdisk_info & VDISK_REMOVABLE)
+		gd->flags |= GENHD_FL_REMOVABLE;
+
+	if (vdisk_info & VDISK_CDROM)
+		gd->flags |= GENHD_FL_CD;
+
+	return 0;
+
+ out:
+	return err;
+}
+
+static void kick_pending_request_queues(struct blkfront_info *info)
+{
+	if (!RING_FULL(&info->ring)) {
+		/* Re-enable calldowns. */
+		blk_start_queue(info->rq);
+		/* Kick things off immediately. */
+		do_blkif_request(info->rq);
+	}
+}
+
+static void blkif_restart_queue(struct work_struct *work)
+{
+	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
+
+	spin_lock_irq(&blkif_io_lock);
+	if (info->connected == BLKIF_STATE_CONNECTED)
+		kick_pending_request_queues(info);
+	spin_unlock_irq(&blkif_io_lock);
+}
+
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+	/* Prevent new requests being issued until we fix things up. */
+	spin_lock_irq(&blkif_io_lock);
+	info->connected = suspend ?
+		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+	/* No more blkif_request(). */
+	if (info->rq)
+		blk_stop_queue(info->rq);
+	/* No more gnttab callback work. */
+	gnttab_cancel_free_callback(&info->callback);
+	spin_unlock_irq(&blkif_io_lock);
+
+	/* Flush gnttab callback work. Must be done with no locks held. */
+	flush_scheduled_work();
+
+	/* Free resources associated with old device channel. */
+	if (info->ring_ref != GRANT_INVALID_REF) {
+		gnttab_end_foreign_access(info->ring_ref, 0,
+					  (unsigned long)info->ring.sring);
+		info->ring_ref = GRANT_INVALID_REF;
+		info->ring.sring = NULL;
+	}
+	if (info->irq)
+		unbind_from_irqhandler(info->irq, info);
+	info->evtchn = info->irq = 0;
+
+}
+
+static void blkif_completion(struct blk_shadow *s)
+{
+	int i;
+	for (i = 0; i < s->req.nr_segments; i++)
+		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+}
+
+static irqreturn_t blkif_interrupt(int irq, void *dev_id)
+{
+	struct request *req;
+	struct blkif_response *bret;
+	RING_IDX i, rp;
+	unsigned long flags;
+	struct blkfront_info *info = (struct blkfront_info *)dev_id;
+	int uptodate;
+
+	spin_lock_irqsave(&blkif_io_lock, flags);
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+		spin_unlock_irqrestore(&blkif_io_lock, flags);
+		return IRQ_HANDLED;
+	}
+
+ again:
+	rp = info->ring.sring->rsp_prod;
+	rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+	for (i = info->ring.rsp_cons; i != rp; i++) {
+		unsigned long id;
+		int ret;
+
+		bret = RING_GET_RESPONSE(&info->ring, i);
+		id   = bret->id;
+		req  = (struct request *)info->shadow[id].request;
+
+		blkif_completion(&info->shadow[id]);
+
+		add_id_to_freelist(info, id);
+
+		uptodate = (bret->status == BLKIF_RSP_OKAY);
+		switch (bret->operation) {
+		case BLKIF_OP_WRITE_BARRIER:
+			if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
+				       info->gd->disk_name);
+				uptodate = -EOPNOTSUPP;
+				info->feature_barrier = 0;
+				xlvbd_barrier(info);
+			}
+			/* fall through */
+		case BLKIF_OP_READ:
+		case BLKIF_OP_WRITE:
+			if (unlikely(bret->status != BLKIF_RSP_OKAY))
+				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
+					"request: %x\n", bret->status);
+
+			ret = end_that_request_first(req, uptodate,
+				req->hard_nr_sectors);
+			BUG_ON(ret);
+			end_that_request_last(req, uptodate);
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	info->ring.rsp_cons = i;
+
+	if (i != info->ring.req_prod_pvt) {
+		int more_to_do;
+		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+		if (more_to_do)
+			goto again;
+	} else
+		info->ring.sring->rsp_event = i + 1;
+
+	kick_pending_request_queues(info);
+
+	spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+
+static int setup_blkring(struct xenbus_device *dev,
+			 struct blkfront_info *info)
+{
+	struct blkif_sring *sring;
+	int err;
+
+	info->ring_ref = GRANT_INVALID_REF;
+
+	sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL);
+	if (!sring) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
+		return -ENOMEM;
+	}
+	SHARED_RING_INIT(sring);
+	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
+	if (err < 0) {
+		free_page((unsigned long)sring);
+		info->ring.sring = NULL;
+		goto fail;
+	}
+	info->ring_ref = err;
+
+	err = xenbus_alloc_evtchn(dev, &info->evtchn);
+	if (err)
+		goto fail;
+
+	err = bind_evtchn_to_irqhandler(info->evtchn,
+					blkif_interrupt,
+					IRQF_SAMPLE_RANDOM, "blkif", info);
+	if (err <= 0) {
+		xenbus_dev_fatal(dev, err,
+				 "bind_evtchn_to_irqhandler failed");
+		goto fail;
+	}
+	info->irq = err;
+
+	return 0;
+fail:
+	blkif_free(info, 0);
+	return err;
+}
+
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_backend(struct xenbus_device *dev,
+			   struct blkfront_info *info)
+{
+	const char *message = NULL;
+	struct xenbus_transaction xbt;
+	int err;
+
+	/* Create shared ring, alloc event channel. */
+	err = setup_blkring(dev, info);
+	if (err)
+		goto out;
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		goto destroy_blkring;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename,
+			    "ring-ref", "%u", info->ring_ref);
+	if (err) {
+		message = "writing ring-ref";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, dev->nodename,
+			    "event-channel", "%u", info->evtchn);
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto destroy_blkring;
+	}
+
+	xenbus_switch_state(dev, XenbusStateInitialised);
+
+	return 0;
+
+ abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	if (message)
+		xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+	blkif_free(info, 0);
+ out:
+	return err;
+}
+
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+ * inform the backend of the appropriate details for those.  Switch to
+ * Initialised state.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+			  const struct xenbus_device_id *id)
+{
+	int err, vdevice, i;
+	struct blkfront_info *info;
+
+	/* FIXME: Use dynamic device id if this is not set. */
+	err = xenbus_scanf(XBT_NIL, dev->nodename,
+			   "virtual-device", "%i", &vdevice);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading virtual-device");
+		return err;
+	}
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+		return -ENOMEM;
+	}
+
+	info->xbdev = dev;
+	info->vdevice = vdevice;
+	info->connected = BLKIF_STATE_DISCONNECTED;
+	INIT_WORK(&info->work, blkif_restart_queue);
+
+	for (i = 0; i < BLK_RING_SIZE; i++)
+		info->shadow[i].req.id = i+1;
+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+	/* Front end dir is a number, which is used as the id. */
+	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+	dev->dev.driver_data = info;
+
+	err = talk_to_backend(dev, info);
+	if (err) {
+		kfree(info);
+		dev->dev.driver_data = NULL;
+		return err;
+	}
+
+	return 0;
+}
+
+
+static int blkif_recover(struct blkfront_info *info)
+{
+	int i;
+	struct blkif_request *req;
+	struct blk_shadow *copy;
+	int j;
+
+	/* Stage 1: Make a safe copy of the shadow state. */
+	copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+	memcpy(copy, info->shadow, sizeof(info->shadow));
+
+	/* Stage 2: Set up free list. */
+	memset(&info->shadow, 0, sizeof(info->shadow));
+	for (i = 0; i < BLK_RING_SIZE; i++)
+		info->shadow[i].req.id = i+1;
+	info->shadow_free = info->ring.req_prod_pvt;
+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+	/* Stage 3: Find pending requests and requeue them. */
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		/* Not in use? */
+		if (copy[i].request == 0)
+			continue;
+
+		/* Grab a request slot and copy shadow state into it. */
+		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+		*req = copy[i].req;
+
+		/* We get a new request id, and must reset the shadow state. */
+		req->id = get_id_from_freelist(info);
+		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+
+		/* Rewrite any grant references invalidated by susp/resume. */
+		for (j = 0; j < req->nr_segments; j++)
+			gnttab_grant_foreign_access_ref(
+				req->seg[j].gref,
+				info->xbdev->otherend_id,
+				pfn_to_mfn(info->shadow[req->id].frame[j]),
+				rq_data_dir(
+					(struct request *)
+					info->shadow[req->id].request));
+		info->shadow[req->id].req = *req;
+
+		info->ring.req_prod_pvt++;
+	}
+
+	kfree(copy);
+
+	xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+	spin_lock_irq(&blkif_io_lock);
+
+	/* Now safe for us to use the shared ring */
+	info->connected = BLKIF_STATE_CONNECTED;
+
+	/* Send off requeued requests */
+	flush_requests(info);
+
+	/* Kick any other new requests queued since we resumed */
+	kick_pending_request_queues(info);
+
+	spin_unlock_irq(&blkif_io_lock);
+
+	return 0;
+}
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart.  We tear down our blkif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int blkfront_resume(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev->dev.driver_data;
+	int err;
+
+	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
+
+	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+	err = talk_to_backend(dev, info);
+	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
+		err = blkif_recover(info);
+
+	return err;
+}
+
+
+/*
+ * Invoked when the backend is finally 'ready' (and has told produced
+ * the details about the physical device - #sectors, size, etc).
+ */
+static void blkfront_connect(struct blkfront_info *info)
+{
+	unsigned long long sectors;
+	unsigned long sector_size;
+	unsigned int binfo;
+	int err;
+
+	if ((info->connected == BLKIF_STATE_CONNECTED) ||
+	    (info->connected == BLKIF_STATE_SUSPENDED) )
+		return;
+
+	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
+		__func__, info->xbdev->otherend);
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "sectors", "%llu", &sectors,
+			    "info", "%u", &binfo,
+			    "sector-size", "%lu", &sector_size,
+			    NULL);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err,
+				 "reading backend fields at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-barrier", "%lu", &info->feature_barrier,
+			    NULL);
+	if (err)
+		info->feature_barrier = 0;
+
+	err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
+				  sectors, info->vdevice,
+				  binfo, sector_size, info);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+	/* Kick pending requests. */
+	spin_lock_irq(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;
+	kick_pending_request_queues(info);
+	spin_unlock_irq(&blkif_io_lock);
+
+	add_disk(info->gd);
+}
+
+/**
+ * Handle the change of state of the backend to Closing.  We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend.  Once is this done, we can switch to Closed in
+ * acknowledgement.
+ */
+static void blkfront_closing(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev->dev.driver_data;
+	unsigned long flags;
+
+	dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
+
+	if (info->rq == NULL)
+		goto out;
+
+	spin_lock_irqsave(&blkif_io_lock, flags);
+
+	del_gendisk(info->gd);
+
+	/* No more blkif_request(). */
+	blk_stop_queue(info->rq);
+
+	/* No more gnttab callback work. */
+	gnttab_cancel_free_callback(&info->callback);
+	spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+	/* Flush gnttab callback work. Must be done with no locks held. */
+	flush_scheduled_work();
+
+	blk_cleanup_queue(info->rq);
+	info->rq = NULL;
+
+ out:
+	xenbus_frontend_closed(dev);
+}
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void backend_changed(struct xenbus_device *dev,
+			    enum xenbus_state backend_state)
+{
+	struct blkfront_info *info = dev->dev.driver_data;
+	struct block_device *bd;
+
+	dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
+
+	switch (backend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+	case XenbusStateUnknown:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateConnected:
+		blkfront_connect(info);
+		break;
+
+	case XenbusStateClosing:
+		bd = bdget(info->dev);
+		if (bd == NULL)
+			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+
+		mutex_lock(&bd->bd_mutex);
+		if (info->users > 0)
+			xenbus_dev_error(dev, -EBUSY,
+					 "Device in use; refusing to close");
+		else
+			blkfront_closing(dev);
+		mutex_unlock(&bd->bd_mutex);
+		bdput(bd);
+		break;
+	}
+}
+
+static int blkfront_remove(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev->dev.driver_data;
+
+	dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
+
+	blkif_free(info, 0);
+
+	kfree(info);
+
+	return 0;
+}
+
+static int blkif_open(struct inode *inode, struct file *filep)
+{
+	struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
+	info->users++;
+	return 0;
+}
+
+static int blkif_release(struct inode *inode, struct file *filep)
+{
+	struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
+	info->users--;
+	if (info->users == 0) {
+		/* Check whether we have been instructed to close.  We will
+		   have ignored this request initially, as the device was
+		   still mounted. */
+		struct xenbus_device *dev = info->xbdev;
+		enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+
+		if (state == XenbusStateClosing)
+			blkfront_closing(dev);
+	}
+	return 0;
+}
+
+static struct block_device_operations xlvbd_block_fops =
+{
+	.owner = THIS_MODULE,
+	.open = blkif_open,
+	.release = blkif_release,
+};
+
+
+static struct xenbus_device_id blkfront_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+static struct xenbus_driver blkfront = {
+	.name = "vbd",
+	.owner = THIS_MODULE,
+	.ids = blkfront_ids,
+	.probe = blkfront_probe,
+	.remove = blkfront_remove,
+	.resume = blkfront_resume,
+	.otherend_changed = backend_changed,
+};
+
+static int __init xlblk_init(void)
+{
+	if (!is_running_on_xen())
+		return -ENODEV;
+
+	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
+		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
+		       XENVBD_MAJOR, DEV_NAME);
+		return -ENODEV;
+	}
+
+	return xenbus_register_frontend(&blkfront);
+}
+module_init(xlblk_init);
+
+
+static void xlblk_exit(void)
+{
+	return xenbus_unregister_driver(&blkfront);
+}
+module_exit(xlblk_exit);
+
+MODULE_DESCRIPTION("Xen virtual block device frontend");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
diff --git a/include/linux/major.h b/include/linux/major.h
index 7e7c9093919a..0cb98053537a 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -158,6 +158,8 @@
 #define VXSPEC_MAJOR		200	/* VERITAS volume config driver */
 #define VXDMP_MAJOR		201	/* VERITAS volume multipath driver */
 
+#define XENVBD_MAJOR		202	/* Xen virtual block device */
+
 #define MSR_MAJOR		202
 #define CPUID_MAJOR		203
 
-- 
cgit v1.3-8-gc7d7


From 1c50dc83f9ca752b1e1b985f1ce33d2695103ffa Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Tue, 30 Jan 2007 01:18:41 -0800
Subject: [SCSI] sas_ata: ata_post_internal should abort the sas_task

This patch adds a new field, lldd_task, to ata_queued_cmd so that libata
users such as libsas can associate some data with a qc.  The particular
ambition with this patch is to associate a sas_task with a qc; that way,
if libata decides to timeout a command, we can come back (in
sas_ata_post_internal) and abort the sas task.

One question remains: Is it necessary to reset the phy on error, or will
the libata error handler take care of it?  (Assuming that one is written,
of course.)  This patch, as it is today, works well enough to clean
things up when an ATA device probe attempt fails halfway through the probe,
though I'm not sure this is always the right thing to do.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/libsas/sas_ata.c | 30 +++++++++++++++++++++++++++---
 include/linux/libata.h        |  1 +
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 46e1dbe1b843..c8af884abe18 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -88,12 +88,17 @@ static enum ata_completion_errors sas_to_ata_err(struct task_status_struct *ts)
 static void sas_ata_task_done(struct sas_task *task)
 {
 	struct ata_queued_cmd *qc = task->uldd_task;
-	struct domain_device *dev = qc->ap->private_data;
+	struct domain_device *dev;
 	struct task_status_struct *stat = &task->task_status;
 	struct ata_task_resp *resp = (struct ata_task_resp *)stat->buf;
 	enum ata_completion_errors ac;
 	unsigned long flags;
 
+	if (!qc)
+		goto qc_already_gone;
+
+	dev = qc->ap->private_data;
+
 	spin_lock_irqsave(dev->sata_dev.ap->lock, flags);
 	if (stat->stat == SAS_PROTO_RESPONSE) {
 		ata_tf_from_fis(resp->ending_fis, &dev->sata_dev.tf);
@@ -114,9 +119,11 @@ static void sas_ata_task_done(struct sas_task *task)
 		}
 	}
 
+	qc->lldd_task = NULL;
 	ata_qc_complete(qc);
 	spin_unlock_irqrestore(dev->sata_dev.ap->lock, flags);
 
+qc_already_gone:
 	list_del_init(&task->list);
 	sas_free_task(task);
 }
@@ -166,6 +173,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
 	task->scatter = qc->__sg;
 	task->ata_task.retry_count = 1;
 	task->task_state_flags = SAS_TASK_STATE_PENDING;
+	qc->lldd_task = task;
 
 	switch (qc->tf.protocol) {
 	case ATA_PROT_NCQ:
@@ -237,8 +245,24 @@ static void sas_ata_post_internal(struct ata_queued_cmd *qc)
 	if (qc->flags & ATA_QCFLAG_FAILED)
 		qc->err_mask |= AC_ERR_OTHER;
 
-	if (qc->err_mask)
-		SAS_DPRINTK("%s: Failure; reset phy!\n", __FUNCTION__);
+	if (qc->err_mask) {
+		/*
+		 * Find the sas_task and kill it.  By this point,
+		 * libata has decided to kill the qc, so we needn't
+		 * bother with sas_ata_task_done.  But we still
+		 * ought to abort the task.
+		 */
+		struct sas_task *task = qc->lldd_task;
+		struct domain_device *dev = qc->ap->private_data;
+
+		qc->lldd_task = NULL;
+		if (task) {
+			task->uldd_task = NULL;
+			__sas_task_abort(task);
+		}
+
+		sas_phy_reset(dev->port->phy, 1);
+	}
 }
 
 static void sas_ata_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 47cd2a1c5544..4abb758a0450 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -411,6 +411,7 @@ struct ata_queued_cmd {
 	ata_qc_cb_t		complete_fn;
 
 	void			*private_data;
+	void			*lldd_task;
 };
 
 struct ata_port_stats {
-- 
cgit v1.3-8-gc7d7


From 0c8db6beb81a07147f64cffd33bd43b9e96f4f40 Mon Sep 17 00:00:00 2001
From: "Prakash, Sathya" <sathya.prakash@lsi.com>
Date: Tue, 17 Jul 2007 13:40:10 +0530
Subject: [SCSI] add PCI_VENDOR_ID macro for Brocade in pci_ids.h

Adds PCI_VENDOR_ID_BROCADE macro in include/linux/pci_ids.h file. This macro
is used in MPT Fusion FC drivers to support Brocade branded FC controllers

signed-off-by: Sathya Prakash <sathya.prakash@lsi.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2c7add169539..13d36bb01a42 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2017,6 +2017,8 @@
 
 #define PCI_VENDOR_ID_ARIMA		0x161f
 
+#define PCI_VENDOR_ID_BROCADE		0x1657
+
 #define PCI_VENDOR_ID_SIBYTE		0x166d
 #define PCI_DEVICE_ID_BCM1250_PCI	0x0001
 #define PCI_DEVICE_ID_BCM1250_HT	0x0002
-- 
cgit v1.3-8-gc7d7


From 7132ab7f6e0309bb8e0424e395ba149aee0c750e Mon Sep 17 00:00:00 2001
From: Andy Fleming <afleming@freescale.com>
Date: Wed, 11 Jul 2007 11:43:07 -0500
Subject: Fix RGMII-ID handling in gianfar

The TSEC/eTSEC can detect the interface to the PHY automatically,
but it isn't able to detect whether the RGMII connection needs internal
delay.  So we need to detect that change in the device tree, propagate
it to the platform data, and then check it if we're in RGMII.  This fixes
a bug on the 8641D HPCN board where the Vitesse PHY doesn't use the delay
for RGMII.

Signed-off-by: Andy Fleming <afleming@freescale.com>
---
 arch/powerpc/sysdev/fsl_soc.c |  9 +++++++++
 drivers/net/gianfar.c         | 12 +++++++++++-
 include/linux/fsl_devices.h   |  1 +
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index c0ddc80d8160..3289fab01e92 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -197,6 +197,7 @@ static int __init gfar_of_init(void)
 		struct gianfar_platform_data gfar_data;
 		const unsigned int *id;
 		const char *model;
+		const char *ctype;
 		const void *mac_addr;
 		const phandle *ph;
 		int n_res = 2;
@@ -254,6 +255,14 @@ static int __init gfar_of_init(void)
 			    FSL_GIANFAR_DEV_HAS_VLAN |
 			    FSL_GIANFAR_DEV_HAS_EXTENDED_HASH;
 
+		ctype = of_get_property(np, "phy-connection-type", NULL);
+
+		/* We only care about rgmii-id.  The rest are autodetected */
+		if (ctype && !strcmp(ctype, "rgmii-id"))
+			gfar_data.interface = PHY_INTERFACE_MODE_RGMII_ID;
+		else
+			gfar_data.interface = PHY_INTERFACE_MODE_MII;
+
 		ph = of_get_property(np, "phy-handle", NULL);
 		phy = of_find_node_by_phandle(*ph);
 
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index d7a1a58de766..f92690555dd9 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -420,8 +420,18 @@ static phy_interface_t gfar_get_interface(struct net_device *dev)
 	if (ecntrl & ECNTRL_REDUCED_MODE) {
 		if (ecntrl & ECNTRL_REDUCED_MII_MODE)
 			return PHY_INTERFACE_MODE_RMII;
-		else
+		else {
+			phy_interface_t interface = priv->einfo->interface;
+
+			/*
+			 * This isn't autodetected right now, so it must
+			 * be set by the device tree or platform code.
+			 */
+			if (interface == PHY_INTERFACE_MODE_RGMII_ID)
+				return PHY_INTERFACE_MODE_RGMII_ID;
+
 			return PHY_INTERFACE_MODE_RGMII;
+		}
 	}
 
 	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT)
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 695741b0e420..1831b196c70a 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -53,6 +53,7 @@ struct gianfar_platform_data {
 	u32	bus_id;
 	u32	phy_id;
 	u8	mac_addr[6];
+	phy_interface_t interface;
 };
 
 struct gianfar_mdio_data {
-- 
cgit v1.3-8-gc7d7


From b4ff4f0419ae5db83553fab79d03a89c10d540a8 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 18 Jul 2007 15:46:06 -0700
Subject: [NETLINK]: allocate group bitmaps dynamically

Allow changing the number of groups for a netlink family
after it has been created, use RCU to protect the listeners
bitmap keeping netlink_has_listeners() lock-free.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Patrick McHardy <kaber@trash.net>
Acked-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  |   1 +
 net/netlink/af_netlink.c | 106 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 83 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 2e23353c28a5..b971ddd24090 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -161,6 +161,7 @@ extern struct sock *netlink_kernel_create(int unit, unsigned int groups,
 					  void (*input)(struct sock *sk, int len),
 					  struct mutex *cb_mutex,
 					  struct module *module);
+extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 641cfbc278d8..c386eaf6ad5b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -62,6 +62,7 @@
 #include <net/netlink.h>
 
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
+#define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
 struct netlink_sock {
 	/* struct sock has to be the first member of netlink_sock */
@@ -314,10 +315,12 @@ netlink_update_listeners(struct sock *sk)
 	unsigned long mask;
 	unsigned int i;
 
-	for (i = 0; i < NLGRPSZ(tbl->groups)/sizeof(unsigned long); i++) {
+	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
 		mask = 0;
-		sk_for_each_bound(sk, node, &tbl->mc_list)
-			mask |= nlk_sk(sk)->groups[i];
+		sk_for_each_bound(sk, node, &tbl->mc_list) {
+			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
+				mask |= nlk_sk(sk)->groups[i];
+		}
 		tbl->listeners[i] = mask;
 	}
 	/* this function is only called with the netlink table "grabbed", which
@@ -555,26 +558,37 @@ netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
 	nlk->subscriptions = subscriptions;
 }
 
-static int netlink_alloc_groups(struct sock *sk)
+static int netlink_realloc_groups(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 	unsigned int groups;
+	unsigned long *new_groups;
 	int err = 0;
 
-	netlink_lock_table();
+	netlink_table_grab();
+
 	groups = nl_table[sk->sk_protocol].groups;
-	if (!nl_table[sk->sk_protocol].registered)
+	if (!nl_table[sk->sk_protocol].registered) {
 		err = -ENOENT;
-	netlink_unlock_table();
+		goto out_unlock;
+	}
 
-	if (err)
-		return err;
+	if (nlk->ngroups >= groups)
+		goto out_unlock;
 
-	nlk->groups = kzalloc(NLGRPSZ(groups), GFP_KERNEL);
-	if (nlk->groups == NULL)
-		return -ENOMEM;
+	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
+	if (new_groups == NULL) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+	memset((char*)new_groups + NLGRPSZ(nlk->ngroups), 0,
+	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));
+
+	nlk->groups = new_groups;
 	nlk->ngroups = groups;
-	return 0;
+ out_unlock:
+	netlink_table_ungrab();
+	return err;
 }
 
 static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
@@ -591,11 +605,9 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
 	if (nladdr->nl_groups) {
 		if (!netlink_capable(sock, NL_NONROOT_RECV))
 			return -EPERM;
-		if (nlk->groups == NULL) {
-			err = netlink_alloc_groups(sk);
-			if (err)
-				return err;
-		}
+		err = netlink_realloc_groups(sk);
+		if (err)
+			return err;
 	}
 
 	if (nlk->pid) {
@@ -839,10 +851,18 @@ retry:
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
 	int res = 0;
+	unsigned long *listeners;
 
 	BUG_ON(!(nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET));
+
+	rcu_read_lock();
+	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
+
 	if (group - 1 < nl_table[sk->sk_protocol].groups)
-		res = test_bit(group - 1, nl_table[sk->sk_protocol].listeners);
+		res = test_bit(group - 1, listeners);
+
+	rcu_read_unlock();
+
 	return res;
 }
 EXPORT_SYMBOL_GPL(netlink_has_listeners);
@@ -1037,11 +1057,9 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 
 		if (!netlink_capable(sock, NL_NONROOT_RECV))
 			return -EPERM;
-		if (nlk->groups == NULL) {
-			err = netlink_alloc_groups(sk);
-			if (err)
-				return err;
-		}
+		err = netlink_realloc_groups(sk);
+		if (err)
+			return err;
 		if (!val || val - 1 >= nlk->ngroups)
 			return -EINVAL;
 		netlink_table_grab();
@@ -1328,6 +1346,46 @@ out_sock_release:
 	return NULL;
 }
 
+/**
+ * netlink_change_ngroups - change number of multicast groups
+ *
+ * This changes the number of multicast groups that are available
+ * on a certain netlink family. Note that it is not possible to
+ * change the number of groups to below 32.
+ *
+ * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
+ * @groups: The new number of groups.
+ */
+int netlink_change_ngroups(struct sock *sk, unsigned int groups)
+{
+	unsigned long *listeners, *old = NULL;
+	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
+	int err = 0;
+
+	if (groups < 32)
+		groups = 32;
+
+	netlink_table_grab();
+	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
+		listeners = kzalloc(NLGRPSZ(groups), GFP_ATOMIC);
+		if (!listeners) {
+			err = -ENOMEM;
+			goto out_ungrab;
+		}
+		old = tbl->listeners;
+		memcpy(listeners, old, NLGRPSZ(tbl->groups));
+		rcu_assign_pointer(tbl->listeners, listeners);
+	}
+	tbl->groups = groups;
+
+ out_ungrab:
+	netlink_table_ungrab();
+	synchronize_rcu();
+	kfree(old);
+	return err;
+}
+EXPORT_SYMBOL(netlink_change_ngroups);
+
 void netlink_set_nonroot(int protocol, unsigned int flags)
 {
 	if ((unsigned int)protocol < MAX_LINKS)
-- 
cgit v1.3-8-gc7d7


From 84659eb529b33572bb3f8c94e0978bd5d084bc7e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 18 Jul 2007 15:47:05 -0700
Subject: [NETLIKN]: Allow removing multicast groups.

Allow kicking listeners out of a multicast group when necessary
(for example if that group is going to be removed.)

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Patrick McHardy <kaber@trash.net>
Acked-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  |  1 +
 net/netlink/af_netlink.c | 57 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index b971ddd24090..83d8239f0cce 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -162,6 +162,7 @@ extern struct sock *netlink_kernel_create(int unit, unsigned int groups,
 					  struct mutex *cb_mutex,
 					  struct module *module);
 extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
+extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c386eaf6ad5b..5681ce3aebca 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1027,6 +1027,23 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
 	read_unlock(&nl_table_lock);
 }
 
+/* must be called with netlink table grabbed */
+static void netlink_update_socket_mc(struct netlink_sock *nlk,
+				     unsigned int group,
+				     int is_new)
+{
+	int old, new = !!is_new, subscriptions;
+
+	old = test_bit(group - 1, nlk->groups);
+	subscriptions = nlk->subscriptions - old + new;
+	if (new)
+		__set_bit(group - 1, nlk->groups);
+	else
+		__clear_bit(group - 1, nlk->groups);
+	netlink_update_subscriptions(&nlk->sk, subscriptions);
+	netlink_update_listeners(&nlk->sk);
+}
+
 static int netlink_setsockopt(struct socket *sock, int level, int optname,
 			      char __user *optval, int optlen)
 {
@@ -1052,9 +1069,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		break;
 	case NETLINK_ADD_MEMBERSHIP:
 	case NETLINK_DROP_MEMBERSHIP: {
-		unsigned int subscriptions;
-		int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;
-
 		if (!netlink_capable(sock, NL_NONROOT_RECV))
 			return -EPERM;
 		err = netlink_realloc_groups(sk);
@@ -1063,14 +1077,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		if (!val || val - 1 >= nlk->ngroups)
 			return -EINVAL;
 		netlink_table_grab();
-		old = test_bit(val - 1, nlk->groups);
-		subscriptions = nlk->subscriptions - old + new;
-		if (new)
-			__set_bit(val - 1, nlk->groups);
-		else
-			__clear_bit(val - 1, nlk->groups);
-		netlink_update_subscriptions(sk, subscriptions);
-		netlink_update_listeners(sk);
+		netlink_update_socket_mc(nlk, val,
+					 optname == NETLINK_ADD_MEMBERSHIP);
 		netlink_table_ungrab();
 		err = 0;
 		break;
@@ -1351,7 +1359,9 @@ out_sock_release:
  *
  * This changes the number of multicast groups that are available
  * on a certain netlink family. Note that it is not possible to
- * change the number of groups to below 32.
+ * change the number of groups to below 32. Also note that it does
+ * not implicitly call netlink_clear_multicast_users() when the
+ * number of groups is reduced.
  *
  * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
  * @groups: The new number of groups.
@@ -1386,6 +1396,29 @@ int netlink_change_ngroups(struct sock *sk, unsigned int groups)
 }
 EXPORT_SYMBOL(netlink_change_ngroups);
 
+/**
+ * netlink_clear_multicast_users - kick off multicast listeners
+ *
+ * This function removes all listeners from the given group.
+ * @ksk: The kernel netlink socket, as returned by
+ *	netlink_kernel_create().
+ * @group: The multicast group to clear.
+ */
+void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
+
+	netlink_table_grab();
+
+	sk_for_each_bound(sk, node, &tbl->mc_list)
+		netlink_update_socket_mc(nlk_sk(sk), group, 0);
+
+	netlink_table_ungrab();
+}
+EXPORT_SYMBOL(netlink_clear_multicast_users);
+
 void netlink_set_nonroot(int protocol, unsigned int flags)
 {
 	if ((unsigned int)protocol < MAX_LINKS)
-- 
cgit v1.3-8-gc7d7


From 2dbba6f773d1e1e4c78f03b0dbf19790d9017693 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 18 Jul 2007 15:47:52 -0700
Subject: [GENETLINK]: Dynamic multicast groups.

Introduce API to dynamically register and unregister multicast groups.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Patrick McHardy <kaber@trash.net>
Acked-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/genetlink.h |  13 +++
 include/net/genetlink.h   |  22 +++++
 net/netlink/genetlink.c   | 235 ++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 263 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
index f7a93770e1be..7da02c93002b 100644
--- a/include/linux/genetlink.h
+++ b/include/linux/genetlink.h
@@ -39,6 +39,9 @@ enum {
 	CTRL_CMD_NEWOPS,
 	CTRL_CMD_DELOPS,
 	CTRL_CMD_GETOPS,
+	CTRL_CMD_NEWMCAST_GRP,
+	CTRL_CMD_DELMCAST_GRP,
+	CTRL_CMD_GETMCAST_GRP, /* unused */
 	__CTRL_CMD_MAX,
 };
 
@@ -52,6 +55,7 @@ enum {
 	CTRL_ATTR_HDRSIZE,
 	CTRL_ATTR_MAXATTR,
 	CTRL_ATTR_OPS,
+	CTRL_ATTR_MCAST_GROUPS,
 	__CTRL_ATTR_MAX,
 };
 
@@ -66,4 +70,13 @@ enum {
 
 #define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1)
 
+enum {
+	CTRL_ATTR_MCAST_GRP_UNSPEC,
+	CTRL_ATTR_MCAST_GRP_NAME,
+	CTRL_ATTR_MCAST_GRP_ID,
+	__CTRL_ATTR_MCAST_GRP_MAX,
+};
+
+#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1)
+
 #endif	/* __LINUX_GENERIC_NETLINK_H */
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index b6eaca122db8..decdda546829 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -4,6 +4,22 @@
 #include <linux/genetlink.h>
 #include <net/netlink.h>
 
+/**
+ * struct genl_multicast_group - generic netlink multicast group
+ * @name: name of the multicast group, names are per-family
+ * @id: multicast group ID, assigned by the core, to use with
+ *      genlmsg_multicast().
+ * @list: list entry for linking
+ * @family: pointer to family, need not be set before registering
+ */
+struct genl_multicast_group
+{
+	struct genl_family	*family;	/* private */
+	struct list_head	list;		/* private */
+	char			name[GENL_NAMSIZ];
+	u32			id;
+};
+
 /**
  * struct genl_family - generic netlink family
  * @id: protocol family idenfitier
@@ -14,6 +30,7 @@
  * @attrbuf: buffer to store parsed attributes
  * @ops_list: list of all assigned operations
  * @family_list: family list
+ * @mcast_groups: multicast groups list
  */
 struct genl_family
 {
@@ -25,6 +42,7 @@ struct genl_family
 	struct nlattr **	attrbuf;	/* private */
 	struct list_head	ops_list;	/* private */
 	struct list_head	family_list;	/* private */
+	struct list_head	mcast_groups;	/* private */
 };
 
 /**
@@ -73,6 +91,10 @@ extern int genl_register_family(struct genl_family *family);
 extern int genl_unregister_family(struct genl_family *family);
 extern int genl_register_ops(struct genl_family *, struct genl_ops *ops);
 extern int genl_unregister_ops(struct genl_family *, struct genl_ops *ops);
+extern int genl_register_mc_group(struct genl_family *family,
+				  struct genl_multicast_group *grp);
+extern void genl_unregister_mc_group(struct genl_family *family,
+				     struct genl_multicast_group *grp);
 
 extern struct sock *genl_sock;
 
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index b9ab62f938d0..e146531faf1d 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -3,6 +3,7 @@
  *
  * 		Authors:	Jamal Hadi Salim
  * 				Thomas Graf <tgraf@suug.ch>
+ *				Johannes Berg <johannes@sipsolutions.net>
  */
 
 #include <linux/module.h>
@@ -13,6 +14,7 @@
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <linux/mutex.h>
+#include <linux/bitmap.h>
 #include <net/sock.h>
 #include <net/genetlink.h>
 
@@ -42,6 +44,16 @@ static void genl_unlock(void)
 #define GENL_FAM_TAB_MASK	(GENL_FAM_TAB_SIZE - 1)
 
 static struct list_head family_ht[GENL_FAM_TAB_SIZE];
+/*
+ * Bitmap of multicast groups that are currently in use.
+ *
+ * To avoid an allocation at boot of just one unsigned long,
+ * declare it global instead.
+ * Bit 0 is marked as already used since group 0 is invalid.
+ */
+static unsigned long mc_group_start = 0x1;
+static unsigned long *mc_groups = &mc_group_start;
+static unsigned long mc_groups_longs = 1;
 
 static int genl_ctrl_event(int event, void *data);
 
@@ -116,6 +128,114 @@ static inline u16 genl_generate_id(void)
 	return id_gen_idx;
 }
 
+static struct genl_multicast_group notify_grp;
+
+/**
+ * genl_register_mc_group - register a multicast group
+ *
+ * Registers the specified multicast group and notifies userspace
+ * about the new group.
+ *
+ * Returns 0 on success or a negative error code.
+ *
+ * @family: The generic netlink family the group shall be registered for.
+ * @grp: The group to register, must have a name.
+ */
+int genl_register_mc_group(struct genl_family *family,
+			   struct genl_multicast_group *grp)
+{
+	int id;
+	unsigned long *new_groups;
+	int err;
+
+	BUG_ON(grp->name[0] == '\0');
+
+	genl_lock();
+
+	/* special-case our own group */
+	if (grp == &notify_grp)
+		id = GENL_ID_CTRL;
+	else
+		id = find_first_zero_bit(mc_groups,
+					 mc_groups_longs * BITS_PER_LONG);
+
+
+	if (id >= mc_groups_longs * BITS_PER_LONG) {
+		size_t nlen = (mc_groups_longs + 1) * sizeof(unsigned long);
+
+		if (mc_groups == &mc_group_start) {
+			new_groups = kzalloc(nlen, GFP_KERNEL);
+			if (!new_groups) {
+				err = -ENOMEM;
+				goto out;
+			}
+			mc_groups = new_groups;
+			*mc_groups = mc_group_start;
+		} else {
+			new_groups = krealloc(mc_groups, nlen, GFP_KERNEL);
+			if (!new_groups) {
+				err = -ENOMEM;
+				goto out;
+			}
+			mc_groups = new_groups;
+			mc_groups[mc_groups_longs] = 0;
+		}
+		mc_groups_longs++;
+	}
+
+	err = netlink_change_ngroups(genl_sock,
+				     sizeof(unsigned long) * NETLINK_GENERIC);
+	if (err)
+		goto out;
+
+	grp->id = id;
+	set_bit(id, mc_groups);
+	list_add_tail(&grp->list, &family->mcast_groups);
+	grp->family = family;
+
+	genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, grp);
+ out:
+	genl_unlock();
+	return 0;
+}
+EXPORT_SYMBOL(genl_register_mc_group);
+
+/**
+ * genl_unregister_mc_group - unregister a multicast group
+ *
+ * Unregisters the specified multicast group and notifies userspace
+ * about it. All current listeners on the group are removed.
+ *
+ * Note: It is not necessary to unregister all multicast groups before
+ *       unregistering the family, unregistering the family will cause
+ *       all assigned multicast groups to be unregistered automatically.
+ *
+ * @family: Generic netlink family the group belongs to.
+ * @grp: The group to unregister, must have been registered successfully
+ *	 previously.
+ */
+void genl_unregister_mc_group(struct genl_family *family,
+			      struct genl_multicast_group *grp)
+{
+	BUG_ON(grp->family != family);
+	genl_lock();
+	netlink_clear_multicast_users(genl_sock, grp->id);
+	clear_bit(grp->id, mc_groups);
+	list_del(&grp->list);
+	genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, grp);
+	grp->id = 0;
+	grp->family = NULL;
+	genl_unlock();
+}
+
+static void genl_unregister_mc_groups(struct genl_family *family)
+{
+	struct genl_multicast_group *grp, *tmp;
+
+	list_for_each_entry_safe(grp, tmp, &family->mcast_groups, list)
+		genl_unregister_mc_group(family, grp);
+}
+
 /**
  * genl_register_ops - register generic netlink operations
  * @family: generic netlink family
@@ -216,6 +336,7 @@ int genl_register_family(struct genl_family *family)
 		goto errout;
 
 	INIT_LIST_HEAD(&family->ops_list);
+	INIT_LIST_HEAD(&family->mcast_groups);
 
 	genl_lock();
 
@@ -275,6 +396,8 @@ int genl_unregister_family(struct genl_family *family)
 {
 	struct genl_family *rc;
 
+	genl_unregister_mc_groups(family);
+
 	genl_lock();
 
 	list_for_each_entry(rc, genl_family_chain(family->id), family_list) {
@@ -410,6 +533,67 @@ static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
 		nla_nest_end(skb, nla_ops);
 	}
 
+	if (!list_empty(&family->mcast_groups)) {
+		struct genl_multicast_group *grp;
+		struct nlattr *nla_grps;
+		int idx = 1;
+
+		nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS);
+		if (nla_grps == NULL)
+			goto nla_put_failure;
+
+		list_for_each_entry(grp, &family->mcast_groups, list) {
+			struct nlattr *nest;
+
+			nest = nla_nest_start(skb, idx++);
+			if (nest == NULL)
+				goto nla_put_failure;
+
+			NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id);
+			NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME,
+				       grp->name);
+
+			nla_nest_end(skb, nest);
+		}
+		nla_nest_end(skb, nla_grps);
+	}
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	return genlmsg_cancel(skb, hdr);
+}
+
+static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid,
+				u32 seq, u32 flags, struct sk_buff *skb,
+				u8 cmd)
+{
+	void *hdr;
+	struct nlattr *nla_grps;
+	struct nlattr *nest;
+
+	hdr = genlmsg_put(skb, pid, seq, &genl_ctrl, flags, cmd);
+	if (hdr == NULL)
+		return -1;
+
+	NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, grp->family->name);
+	NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, grp->family->id);
+
+	nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS);
+	if (nla_grps == NULL)
+		goto nla_put_failure;
+
+	nest = nla_nest_start(skb, 1);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id);
+	NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME,
+		       grp->name);
+
+	nla_nest_end(skb, nest);
+	nla_nest_end(skb, nla_grps);
+
 	return genlmsg_end(skb, hdr);
 
 nla_put_failure:
@@ -453,8 +637,8 @@ errout:
 	return skb->len;
 }
 
-static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
-				      int seq, u8 cmd)
+static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
+					     u32 pid, int seq, u8 cmd)
 {
 	struct sk_buff *skb;
 	int err;
@@ -472,6 +656,25 @@ static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
 	return skb;
 }
 
+static struct sk_buff *ctrl_build_mcgrp_msg(struct genl_multicast_group *grp,
+					    u32 pid, int seq, u8 cmd)
+{
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	err = ctrl_fill_mcgrp_info(grp, pid, seq, 0, skb, cmd);
+	if (err < 0) {
+		nlmsg_free(skb);
+		return ERR_PTR(err);
+	}
+
+	return skb;
+}
+
 static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = {
 	[CTRL_ATTR_FAMILY_ID]	= { .type = NLA_U16 },
 	[CTRL_ATTR_FAMILY_NAME]	= { .type = NLA_NUL_STRING,
@@ -501,8 +704,8 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
 		goto errout;
 	}
 
-	msg = ctrl_build_msg(res, info->snd_pid, info->snd_seq,
-			     CTRL_CMD_NEWFAMILY);
+	msg = ctrl_build_family_msg(res, info->snd_pid, info->snd_seq,
+				    CTRL_CMD_NEWFAMILY);
 	if (IS_ERR(msg)) {
 		err = PTR_ERR(msg);
 		goto errout;
@@ -523,7 +726,15 @@ static int genl_ctrl_event(int event, void *data)
 	switch (event) {
 	case CTRL_CMD_NEWFAMILY:
 	case CTRL_CMD_DELFAMILY:
-		msg = ctrl_build_msg(data, 0, 0, event);
+		msg = ctrl_build_family_msg(data, 0, 0, event);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
+
+		genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL);
+		break;
+	case CTRL_CMD_NEWMCAST_GRP:
+	case CTRL_CMD_DELMCAST_GRP:
+		msg = ctrl_build_mcgrp_msg(data, 0, 0, event);
 		if (IS_ERR(msg))
 			return PTR_ERR(msg);
 
@@ -541,6 +752,10 @@ static struct genl_ops genl_ctrl_ops = {
 	.policy		= ctrl_policy,
 };
 
+static struct genl_multicast_group notify_grp = {
+	.name		= "notify",
+};
+
 static int __init genl_init(void)
 {
 	int i, err;
@@ -557,11 +772,17 @@ static int __init genl_init(void)
 		goto errout_register;
 
 	netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV);
-	genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID,
-					  genl_rcv, NULL, THIS_MODULE);
+
+	/* we'll bump the group number right afterwards */
+	genl_sock = netlink_kernel_create(NETLINK_GENERIC, 0, genl_rcv,
+					  NULL, THIS_MODULE);
 	if (genl_sock == NULL)
 		panic("GENL: Cannot initialize generic netlink\n");
 
+	err = genl_register_mc_group(&genl_ctrl, &notify_grp);
+	if (err < 0)
+		goto errout_register;
+
 	return 0;
 
 errout_register:
-- 
cgit v1.3-8-gc7d7


From 60a96a59569bab85571d0089682109bd3324e896 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 8 Jul 2007 22:29:26 +0200
Subject: Driver core: accept all valid action-strings in uevent-trigger

This allows the uevent file to handle any type of uevent action to be
triggered by userspace instead of just the "add" uevent.


Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/core.c     | 23 ++++++++++++++++++++---
 include/linux/kobject.h | 25 +++++++++++++++++--------
 lib/kobject_uevent.c    | 30 ++++++++++--------------------
 3 files changed, 47 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 0455aa78fa13..91a0367d583e 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -24,6 +24,8 @@
 #include "base.h"
 #include "power/power.h"
 
+extern const char *kobject_actions[];
+
 int (*platform_notify)(struct device * dev) = NULL;
 int (*platform_notify_remove)(struct device * dev) = NULL;
 
@@ -303,10 +305,25 @@ out:
 static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
-	if (memcmp(buf, "add", 3) != 0)
-		dev_err(dev, "uevent: unsupported action-string; this will "
-			"be ignored in a future kernel version");
+	size_t len = count;
+	enum kobject_action action;
+
+	if (len && buf[len-1] == '\n')
+		len--;
+
+	for (action = 0; action < KOBJ_MAX; action++) {
+		if (strncmp(kobject_actions[action], buf, len) != 0)
+			continue;
+		if (kobject_actions[action][len] != '\0')
+			continue;
+		kobject_uevent(&dev->kobj, action);
+		goto out;
+	}
+
+	dev_err(dev, "uevent: unsupported action-string; this will "
+		     "be ignored in a future kernel version\n");
 	kobject_uevent(&dev->kobj, KOBJ_ADD);
+out:
 	return count;
 }
 
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 06cbf41d32d2..aa2fe22b1baa 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -36,15 +36,24 @@ extern char uevent_helper[];
 /* counter to tag the uevent, read only except for the kobject core */
 extern u64 uevent_seqnum;
 
-/* the actions here must match the proper string in lib/kobject_uevent.c */
-typedef int __bitwise kobject_action_t;
+/*
+ * The actions here must match the index to the string array
+ * in lib/kobject_uevent.c
+ *
+ * Do not add new actions here without checking with the driver-core
+ * maintainers. Action strings are not meant to express subsystem
+ * or device specific properties. In most cases you want to send a
+ * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event
+ * specific variables added to the event environment.
+ */
 enum kobject_action {
-	KOBJ_ADD	= (__force kobject_action_t) 0x01,	/* exclusive to core */
-	KOBJ_REMOVE	= (__force kobject_action_t) 0x02,	/* exclusive to core */
-	KOBJ_CHANGE	= (__force kobject_action_t) 0x03,	/* device state change */
-	KOBJ_OFFLINE	= (__force kobject_action_t) 0x04,	/* device offline */
-	KOBJ_ONLINE	= (__force kobject_action_t) 0x05,	/* device online */
-	KOBJ_MOVE	= (__force kobject_action_t) 0x06,	/* device move */
+	KOBJ_ADD,
+	KOBJ_REMOVE,
+	KOBJ_CHANGE,
+	KOBJ_MOVE,
+	KOBJ_ONLINE,
+	KOBJ_OFFLINE,
+	KOBJ_MAX
 };
 
 struct kobject {
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index bd5ecbbafab1..6a80c784a8fb 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -33,25 +33,15 @@ static DEFINE_SPINLOCK(sequence_lock);
 static struct sock *uevent_sock;
 #endif
 
-static char *action_to_string(enum kobject_action action)
-{
-	switch (action) {
-	case KOBJ_ADD:
-		return "add";
-	case KOBJ_REMOVE:
-		return "remove";
-	case KOBJ_CHANGE:
-		return "change";
-	case KOBJ_OFFLINE:
-		return "offline";
-	case KOBJ_ONLINE:
-		return "online";
-	case KOBJ_MOVE:
-		return "move";
-	default:
-		return NULL;
-	}
-}
+/* the strings here must match the enum in include/linux/kobject.h */
+const char *kobject_actions[] = {
+	"add",
+	"remove",
+	"change",
+	"move",
+	"online",
+	"offline",
+};
 
 /**
  * kobject_uevent_env - send an uevent with environmental data
@@ -83,7 +73,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 
 	pr_debug("%s\n", __FUNCTION__);
 
-	action_string = action_to_string(action);
+	action_string = kobject_actions[action];
 	if (!action_string) {
 		pr_debug("kobject attempted to send uevent without action_string!\n");
 		return -EINVAL;
-- 
cgit v1.3-8-gc7d7


From 3f8df781fc5f9ee5253a54ba669e1c8872844b86 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Thu, 12 Jul 2007 16:57:22 -0400
Subject: PM: remove deprecated dpm_runtime_* routines

This patch (as933) removes the deprecated dpm_runtime_suspend() and
dpm_runtime_resume() routines from the PM core.  The only user of
those routines is the PCMCIA ds driver; local replacements are added.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
CC: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/feature-removal-schedule.txt |  1 -
 drivers/base/power/Makefile                |  2 +-
 drivers/base/power/power.h                 |  5 --
 drivers/base/power/runtime.c               | 85 ------------------------------
 drivers/pcmcia/ds.c                        | 40 +++++++++++---
 include/linux/pm.h                         | 11 ----
 6 files changed, 35 insertions(+), 109 deletions(-)
 delete mode 100644 drivers/base/power/runtime.c

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 9cf9d8346289..1b5c70758a1a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -27,7 +27,6 @@ Who:	Hans Verkuil <hverkuil@xs4all.nl> and
 ---------------------------
 
 What:	dev->power.power_state
-	dpm_runtime_{suspend,resume)()
 When:	July 2007
 Why:	Broken design for runtime control over driver power states, confusing
 	driver-internal runtime power management with:  mechanisms to support
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 91f230939c1e..fff178007208 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,5 +1,5 @@
 obj-y			:= shutdown.o
-obj-$(CONFIG_PM)	+= main.o suspend.o resume.o runtime.o sysfs.o
+obj-$(CONFIG_PM)	+= main.o suspend.o resume.o sysfs.o
 obj-$(CONFIG_PM_TRACE)	+= trace.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index 2760f25b3ac5..591a0dd5deee 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -62,11 +62,6 @@ extern int resume_device(struct device *);
  */
 extern int suspend_device(struct device *, pm_message_t);
 
-
-/*
- * runtime.c
- */
-
 #else /* CONFIG_PM */
 
 
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
deleted file mode 100644
index df6174d85866..000000000000
--- a/drivers/base/power/runtime.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * drivers/base/power/runtime.c - Handling dynamic device power management.
- *
- * Copyright (c) 2003 Patrick Mochel
- * Copyright (c) 2003 Open Source Development Lab
- *
- */
-
-#include <linux/device.h>
-#include "power.h"
-
-
-static void runtime_resume(struct device * dev)
-{
-	dev_dbg(dev, "resuming\n");
-	if (!dev->power.power_state.event)
-		return;
-	if (!resume_device(dev))
-		dev->power.power_state = PMSG_ON;
-}
-
-
-/**
- *	dpm_runtime_resume - Power one device back on.
- *	@dev:	Device.
- *
- *	Bring one device back to the on state by first powering it
- *	on, then restoring state. We only operate on devices that aren't
- *	already on.
- *	FIXME: We need to handle devices that are in an unknown state.
- */
-
-void dpm_runtime_resume(struct device * dev)
-{
-	mutex_lock(&dpm_mtx);
-	runtime_resume(dev);
-	mutex_unlock(&dpm_mtx);
-}
-EXPORT_SYMBOL(dpm_runtime_resume);
-
-
-/**
- *	dpm_runtime_suspend - Put one device in low-power state.
- *	@dev:	Device.
- *	@state:	State to enter.
- */
-
-int dpm_runtime_suspend(struct device * dev, pm_message_t state)
-{
-	int error = 0;
-
-	mutex_lock(&dpm_mtx);
-	if (dev->power.power_state.event == state.event)
-		goto Done;
-
-	if (dev->power.power_state.event)
-		runtime_resume(dev);
-
-	if (!(error = suspend_device(dev, state)))
-		dev->power.power_state = state;
- Done:
-	mutex_unlock(&dpm_mtx);
-	return error;
-}
-EXPORT_SYMBOL(dpm_runtime_suspend);
-
-
-#if 0
-/**
- *	dpm_set_power_state - Update power_state field.
- *	@dev:	Device.
- *	@state:	Power state device is in.
- *
- *	This is an update mechanism for drivers to notify the core
- *	what power state a device is in. Device probing code may not
- *	always be able to tell, but we need accurate information to
- *	work reliably.
- */
-void dpm_set_power_state(struct device * dev, pm_message_t state)
-{
-	mutex_lock(&dpm_mtx);
-	dev->power.power_state = state;
-	mutex_unlock(&dpm_mtx);
-}
-#endif  /*  0  */
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 143c6efc478a..a99607142fc8 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -1127,6 +1127,34 @@ static int pcmcia_bus_uevent(struct device *dev, char **envp, int num_envp,
 
 #endif
 
+/************************ runtime PM support ***************************/
+
+static int pcmcia_dev_suspend(struct device *dev, pm_message_t state);
+static int pcmcia_dev_resume(struct device *dev);
+
+static int runtime_suspend(struct device *dev)
+{
+	int rc;
+
+	down(&dev->sem);
+	rc = pcmcia_dev_suspend(dev, PMSG_SUSPEND);
+	up(&dev->sem);
+	if (!rc)
+		dev->power.power_state.event = PM_EVENT_SUSPEND;
+	return rc;
+}
+
+static void runtime_resume(struct device *dev)
+{
+	int rc;
+
+	down(&dev->sem);
+	rc = pcmcia_dev_resume(dev);
+	up(&dev->sem);
+	if (!rc)
+		dev->power.power_state.event = PM_EVENT_ON;
+}
+
 /************************ per-device sysfs output ***************************/
 
 #define pcmcia_device_attr(field, test, format)				\
@@ -1173,9 +1201,9 @@ static ssize_t pcmcia_store_pm_state(struct device *dev, struct device_attribute
                 return -EINVAL;
 
 	if ((!p_dev->suspended) && !strncmp(buf, "off", 3))
-		ret = dpm_runtime_suspend(dev, PMSG_SUSPEND);
+		ret = runtime_suspend(dev);
 	else if (p_dev->suspended && !strncmp(buf, "on", 2))
-		dpm_runtime_resume(dev);
+		runtime_resume(dev);
 
 	return ret ? ret : count;
 }
@@ -1312,10 +1340,10 @@ static int pcmcia_bus_suspend_callback(struct device *dev, void * _data)
 	struct pcmcia_socket *skt = _data;
 	struct pcmcia_device *p_dev = to_pcmcia_dev(dev);
 
-	if (p_dev->socket != skt)
+	if (p_dev->socket != skt || p_dev->suspended)
 		return 0;
 
-	return dpm_runtime_suspend(dev, PMSG_SUSPEND);
+	return runtime_suspend(dev);
 }
 
 static int pcmcia_bus_resume_callback(struct device *dev, void * _data)
@@ -1323,10 +1351,10 @@ static int pcmcia_bus_resume_callback(struct device *dev, void * _data)
 	struct pcmcia_socket *skt = _data;
 	struct pcmcia_device *p_dev = to_pcmcia_dev(dev);
 
-	if (p_dev->socket != skt)
+	if (p_dev->socket != skt || !p_dev->suspended)
 		return 0;
 
-	dpm_runtime_resume(dev);
+	runtime_resume(dev);
 
 	return 0;
 }
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 273781c82e4d..2735b7cadd20 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -284,8 +284,6 @@ extern int device_prepare_suspend(pm_message_t state);
 #define device_may_wakeup(dev) \
 	(device_can_wakeup(dev) && (dev)->power.should_wakeup)
 
-extern int dpm_runtime_suspend(struct device *, pm_message_t);
-extern void dpm_runtime_resume(struct device *);
 extern void __suspend_report_result(const char *function, void *fn, int ret);
 
 #define suspend_report_result(fn, ret)					\
@@ -317,15 +315,6 @@ static inline int device_suspend(pm_message_t state)
 #define device_set_wakeup_enable(dev,val)	do{}while(0)
 #define device_may_wakeup(dev)			(0)
 
-static inline int dpm_runtime_suspend(struct device * dev, pm_message_t state)
-{
-	return 0;
-}
-
-static inline void dpm_runtime_resume(struct device * dev)
-{
-}
-
 #define suspend_report_result(fn, ret) do { } while (0)
 
 static inline int call_platform_enable_wakeup(struct device *dev, int is_on)
-- 
cgit v1.3-8-gc7d7


From aebdc3b450a3febf7d7d00cd2235509055ec7082 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Thu, 12 Jul 2007 22:08:22 -0700
Subject: dev_vdbg(), available with -DVERBOSE_DEBUG

This defines a dev_vdbg() call, which is enabled with -DVERBOSE_DEBUG.
When enabled, dev_vdbg() acts just like dev_dbg().  When disabled, it is a
NOP ...  just like dev_dbg() without -DDEBUG.  The specific code was moved
out of a USB patch, but lots of drivers have similar support.

That is, code can now be written to use an additional level of debug
output, selected at compile time.  Many driver authors have found this
idiom to be very useful.  A typical usage model is for "normal" debug
messages to focus on fault paths and not be very "chatty", so that those
messages can be left on during normal operation without much of a
performance or syslog load.  On the other hand "verbose" messages would be
noisy enough that they wouldn't normally be enabled; they might even affect
timings enough to change system or driver behavior.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/driver.c |  7 -------
 include/linux/device.h    | 10 ++++++++++
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 73c49362cd47..654857493a82 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -29,13 +29,6 @@
 #include "hcd.h"
 #include "usb.h"
 
-#define VERBOSE_DEBUG	0
-
-#if VERBOSE_DEBUG
-#define dev_vdbg	dev_dbg
-#else
-#define dev_vdbg(dev, fmt, args...)	do { } while (0)
-#endif
 
 #ifdef CONFIG_HOTPLUG
 
diff --git a/include/linux/device.h b/include/linux/device.h
index be2debed70d2..d9f0a57f5a2f 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -572,6 +572,16 @@ dev_dbg(struct device * dev, const char * fmt, ...)
 }
 #endif
 
+#ifdef VERBOSE_DEBUG
+#define dev_vdbg	dev_dbg
+#else
+static inline int __attribute__ ((format (printf, 2, 3)))
+dev_vdbg(struct device * dev, const char * fmt, ...)
+{
+	return 0;
+}
+#endif
+
 #define dev_err(dev, format, arg...)		\
 	dev_printk(KERN_ERR , dev , format , ## arg)
 #define dev_info(dev, format, arg...)		\
-- 
cgit v1.3-8-gc7d7


From beafc54c4e2fba24e1ca45cdb7f79d9aa83e3db1 Mon Sep 17 00:00:00 2001
From: "Hans J. Koch" <hjk@linutronix.de>
Date: Thu, 7 Dec 2006 10:58:29 +0100
Subject: UIO: Add the User IO core code

This interface allows the ability to write the majority of a driver in
userspace with only a very small shell of a driver in the kernel itself.
It uses a char device and sysfs to interact with a userspace process to
process interrupts and control memory accesses.

See the docbook documentation for more details on how to use this
interface.

From: Hans J. Koch <hjk@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Benedikt Spranger <b.spranger@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/Kconfig            |   1 +
 drivers/Makefile           |   1 +
 drivers/uio/Kconfig        |  16 ++
 drivers/uio/Makefile       |   1 +
 drivers/uio/uio.c          | 701 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/uio_driver.h |  91 ++++++
 6 files changed, 811 insertions(+)
 create mode 100644 drivers/uio/Kconfig
 create mode 100644 drivers/uio/Makefile
 create mode 100644 drivers/uio/uio.c
 create mode 100644 include/linux/uio_driver.h

(limited to 'include/linux')

diff --git a/drivers/Kconfig b/drivers/Kconfig
index 7916f4b86d23..ae01d86070bb 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -84,4 +84,5 @@ source "drivers/auxdisplay/Kconfig"
 
 source "drivers/kvm/Kconfig"
 
+source "drivers/uio/Kconfig"
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 6d9d7fab77f5..c34c8efff609 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_ATA)		+= ata/
 obj-$(CONFIG_FUSION)		+= message/
 obj-$(CONFIG_FIREWIRE)		+= firewire/
 obj-$(CONFIG_IEEE1394)		+= ieee1394/
+obj-$(CONFIG_UIO)		+= uio/
 obj-y				+= cdrom/
 obj-y				+= auxdisplay/
 obj-$(CONFIG_MTD)		+= mtd/
diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
new file mode 100644
index 000000000000..6b1a62d45e46
--- /dev/null
+++ b/drivers/uio/Kconfig
@@ -0,0 +1,16 @@
+menu "Userspace I/O"
+	depends on !S390
+
+config UIO
+	tristate "Userspace I/O drivers"
+	default n
+	help
+	  Enable this to allow the userspace driver core code to be
+	  built.  This code allows userspace programs easy access to
+	  kernel interrupts and memory locations, allowing some drivers
+	  to be written in userspace.  Note that a small kernel driver
+	  is also required for interrupt handling to work properly.
+
+	  If you don't know what to do here, say N.
+
+endmenu
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
new file mode 100644
index 000000000000..9b7c83063e15
--- /dev/null
+++ b/drivers/uio/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_UIO) += uio.o
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
new file mode 100644
index 000000000000..865f32b63b5c
--- /dev/null
+++ b/drivers/uio/uio.c
@@ -0,0 +1,701 @@
+/*
+ * drivers/uio/uio.c
+ *
+ * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
+ * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
+ * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Userspace IO
+ *
+ * Base Functions
+ *
+ * Licensed under the GPLv2 only.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/idr.h>
+#include <linux/string.h>
+#include <linux/kobject.h>
+#include <linux/uio_driver.h>
+
+#define UIO_MAX_DEVICES 255
+
+struct uio_device {
+	struct module		*owner;
+	struct device		*dev;
+	int			minor;
+	atomic_t		event;
+	struct fasync_struct	*async_queue;
+	wait_queue_head_t	wait;
+	int			vma_count;
+	struct uio_info		*info;
+	struct kset 		map_attr_kset;
+};
+
+static int uio_major;
+static DEFINE_IDR(uio_idr);
+static struct file_operations uio_fops;
+
+/* UIO class infrastructure */
+static struct uio_class {
+	struct kref kref;
+	struct class *class;
+} *uio_class;
+
+/*
+ * attributes
+ */
+
+static struct attribute attr_addr = {
+	.name  = "addr",
+	.mode  = S_IRUGO,
+};
+
+static struct attribute attr_size = {
+	.name  = "size",
+	.mode  = S_IRUGO,
+};
+
+static struct attribute* map_attrs[] = {
+	&attr_addr, &attr_size, NULL
+};
+
+static ssize_t map_attr_show(struct kobject *kobj, struct attribute *attr,
+			     char *buf)
+{
+	struct uio_mem *mem = container_of(kobj, struct uio_mem, kobj);
+
+	if (strncmp(attr->name,"addr",4) == 0)
+		return sprintf(buf, "0x%lx\n", mem->addr);
+
+	if (strncmp(attr->name,"size",4) == 0)
+		return sprintf(buf, "0x%lx\n", mem->size);
+
+	return -ENODEV;
+}
+
+static void map_attr_release(struct kobject *kobj)
+{
+	/* TODO ??? */
+}
+
+static struct sysfs_ops map_attr_ops = {
+	.show  = map_attr_show,
+};
+
+static struct kobj_type map_attr_type = {
+	.release	= map_attr_release,
+	.sysfs_ops	= &map_attr_ops,
+	.default_attrs	= map_attrs,
+};
+
+static ssize_t show_name(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct uio_device *idev = dev_get_drvdata(dev);
+	if (idev)
+		return sprintf(buf, "%s\n", idev->info->name);
+	else
+		return -ENODEV;
+}
+static DEVICE_ATTR(name, S_IRUGO, show_name, NULL);
+
+static ssize_t show_version(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct uio_device *idev = dev_get_drvdata(dev);
+	if (idev)
+		return sprintf(buf, "%s\n", idev->info->version);
+	else
+		return -ENODEV;
+}
+static DEVICE_ATTR(version, S_IRUGO, show_version, NULL);
+
+static ssize_t show_event(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	struct uio_device *idev = dev_get_drvdata(dev);
+	if (idev)
+		return sprintf(buf, "%u\n",
+				(unsigned int)atomic_read(&idev->event));
+	else
+		return -ENODEV;
+}
+static DEVICE_ATTR(event, S_IRUGO, show_event, NULL);
+
+static struct attribute *uio_attrs[] = {
+	&dev_attr_name.attr,
+	&dev_attr_version.attr,
+	&dev_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group uio_attr_grp = {
+	.attrs = uio_attrs,
+};
+
+/*
+ * device functions
+ */
+static int uio_dev_add_attributes(struct uio_device *idev)
+{
+	int ret;
+	int mi;
+	int map_found = 0;
+	struct uio_mem *mem;
+
+	ret = sysfs_create_group(&idev->dev->kobj, &uio_attr_grp);
+	if (ret)
+		goto err_group;
+
+	for (mi = 0; mi < MAX_UIO_MAPS; mi++) {
+		mem = &idev->info->mem[mi];
+		if (mem->size == 0)
+			break;
+		if (!map_found) {
+			map_found = 1;
+			kobject_set_name(&idev->map_attr_kset.kobj,"maps");
+			idev->map_attr_kset.ktype = &map_attr_type;
+			idev->map_attr_kset.kobj.parent = &idev->dev->kobj;
+			ret = kset_register(&idev->map_attr_kset);
+			if (ret)
+				goto err_remove_group;
+		}
+		kobject_init(&mem->kobj);
+		kobject_set_name(&mem->kobj,"map%d",mi);
+		mem->kobj.parent = &idev->map_attr_kset.kobj;
+		mem->kobj.kset = &idev->map_attr_kset;
+		ret = kobject_add(&mem->kobj);
+		if (ret)
+			goto err_remove_maps;
+	}
+
+	return 0;
+
+err_remove_maps:
+	for (mi--; mi>=0; mi--) {
+		mem = &idev->info->mem[mi];
+		kobject_unregister(&mem->kobj);
+	}
+	kset_unregister(&idev->map_attr_kset); /* Needed ? */
+err_remove_group:
+	sysfs_remove_group(&idev->dev->kobj, &uio_attr_grp);
+err_group:
+	dev_err(idev->dev, "error creating sysfs files (%d)\n", ret);
+	return ret;
+}
+
+static void uio_dev_del_attributes(struct uio_device *idev)
+{
+	int mi;
+	struct uio_mem *mem;
+	for (mi = 0; mi < MAX_UIO_MAPS; mi++) {
+		mem = &idev->info->mem[mi];
+		if (mem->size == 0)
+			break;
+		kobject_unregister(&mem->kobj);
+	}
+	kset_unregister(&idev->map_attr_kset);
+	sysfs_remove_group(&idev->dev->kobj, &uio_attr_grp);
+}
+
+static int uio_get_minor(struct uio_device *idev)
+{
+	static DEFINE_MUTEX(minor_lock);
+	int retval = -ENOMEM;
+	int id;
+
+	mutex_lock(&minor_lock);
+	if (idr_pre_get(&uio_idr, GFP_KERNEL) == 0)
+		goto exit;
+
+	retval = idr_get_new(&uio_idr, idev, &id);
+	if (retval < 0) {
+		if (retval == -EAGAIN)
+			retval = -ENOMEM;
+		goto exit;
+	}
+	idev->minor = id & MAX_ID_MASK;
+exit:
+	mutex_unlock(&minor_lock);
+	return retval;
+}
+
+static void uio_free_minor(struct uio_device *idev)
+{
+	idr_remove(&uio_idr, idev->minor);
+}
+
+/**
+ * uio_event_notify - trigger an interrupt event
+ * @info: UIO device capabilities
+ */
+void uio_event_notify(struct uio_info *info)
+{
+	struct uio_device *idev = info->uio_dev;
+
+	atomic_inc(&idev->event);
+	wake_up_interruptible(&idev->wait);
+	kill_fasync(&idev->async_queue, SIGIO, POLL_IN);
+}
+EXPORT_SYMBOL_GPL(uio_event_notify);
+
+/**
+ * uio_interrupt - hardware interrupt handler
+ * @irq: IRQ number, can be UIO_IRQ_CYCLIC for cyclic timer
+ * @dev_id: Pointer to the devices uio_device structure
+ */
+static irqreturn_t uio_interrupt(int irq, void *dev_id)
+{
+	struct uio_device *idev = (struct uio_device *)dev_id;
+	irqreturn_t ret = idev->info->handler(irq, idev->info);
+
+	if (ret == IRQ_HANDLED)
+		uio_event_notify(idev->info);
+
+	return ret;
+}
+
+struct uio_listener {
+	struct uio_device *dev;
+	s32 event_count;
+};
+
+static int uio_open(struct inode *inode, struct file *filep)
+{
+	struct uio_device *idev;
+	struct uio_listener *listener;
+	int ret = 0;
+
+	idev = idr_find(&uio_idr, iminor(inode));
+	if (!idev)
+		return -ENODEV;
+
+	listener = kmalloc(sizeof(*listener), GFP_KERNEL);
+	if (!listener)
+		return -ENOMEM;
+
+	listener->dev = idev;
+	listener->event_count = atomic_read(&idev->event);
+	filep->private_data = listener;
+
+	if (idev->info->open) {
+		if (!try_module_get(idev->owner))
+			return -ENODEV;
+		ret = idev->info->open(idev->info, inode);
+		module_put(idev->owner);
+	}
+
+	if (ret)
+		kfree(listener);
+
+	return ret;
+}
+
+static int uio_fasync(int fd, struct file *filep, int on)
+{
+	struct uio_listener *listener = filep->private_data;
+	struct uio_device *idev = listener->dev;
+
+	return fasync_helper(fd, filep, on, &idev->async_queue);
+}
+
+static int uio_release(struct inode *inode, struct file *filep)
+{
+	int ret = 0;
+	struct uio_listener *listener = filep->private_data;
+	struct uio_device *idev = listener->dev;
+
+	if (idev->info->release) {
+		if (!try_module_get(idev->owner))
+			return -ENODEV;
+		ret = idev->info->release(idev->info, inode);
+		module_put(idev->owner);
+	}
+	if (filep->f_flags & FASYNC)
+		ret = uio_fasync(-1, filep, 0);
+	kfree(listener);
+	return ret;
+}
+
+static unsigned int uio_poll(struct file *filep, poll_table *wait)
+{
+	struct uio_listener *listener = filep->private_data;
+	struct uio_device *idev = listener->dev;
+
+	if (idev->info->irq == UIO_IRQ_NONE)
+		return -EIO;
+
+	poll_wait(filep, &idev->wait, wait);
+	if (listener->event_count != atomic_read(&idev->event))
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+static ssize_t uio_read(struct file *filep, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct uio_listener *listener = filep->private_data;
+	struct uio_device *idev = listener->dev;
+	DECLARE_WAITQUEUE(wait, current);
+	ssize_t retval;
+	s32 event_count;
+
+	if (idev->info->irq == UIO_IRQ_NONE)
+		return -EIO;
+
+	if (count != sizeof(s32))
+		return -EINVAL;
+
+	add_wait_queue(&idev->wait, &wait);
+
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		event_count = atomic_read(&idev->event);
+		if (event_count != listener->event_count) {
+			if (copy_to_user(buf, &event_count, count))
+				retval = -EFAULT;
+			else {
+				listener->event_count = event_count;
+				retval = count;
+			}
+			break;
+		}
+
+		if (filep->f_flags & O_NONBLOCK) {
+			retval = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		schedule();
+	} while (1);
+
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&idev->wait, &wait);
+
+	return retval;
+}
+
+static int uio_find_mem_index(struct vm_area_struct *vma)
+{
+	int mi;
+	struct uio_device *idev = vma->vm_private_data;
+
+	for (mi = 0; mi < MAX_UIO_MAPS; mi++) {
+		if (idev->info->mem[mi].size == 0)
+			return -1;
+		if (vma->vm_pgoff == mi)
+			return mi;
+	}
+	return -1;
+}
+
+static void uio_vma_open(struct vm_area_struct *vma)
+{
+	struct uio_device *idev = vma->vm_private_data;
+	idev->vma_count++;
+}
+
+static void uio_vma_close(struct vm_area_struct *vma)
+{
+	struct uio_device *idev = vma->vm_private_data;
+	idev->vma_count--;
+}
+
+static struct page *uio_vma_nopage(struct vm_area_struct *vma,
+				   unsigned long address, int *type)
+{
+	struct uio_device *idev = vma->vm_private_data;
+	struct page* page = NOPAGE_SIGBUS;
+
+	int mi = uio_find_mem_index(vma);
+	if (mi < 0)
+		return page;
+
+	if (idev->info->mem[mi].memtype == UIO_MEM_LOGICAL)
+		page = virt_to_page(idev->info->mem[mi].addr);
+	else
+		page = vmalloc_to_page((void*)idev->info->mem[mi].addr);
+	get_page(page);
+	if (type)
+		*type = VM_FAULT_MINOR;
+	return page;
+}
+
+static struct vm_operations_struct uio_vm_ops = {
+	.open = uio_vma_open,
+	.close = uio_vma_close,
+	.nopage = uio_vma_nopage,
+};
+
+static int uio_mmap_physical(struct vm_area_struct *vma)
+{
+	struct uio_device *idev = vma->vm_private_data;
+	int mi = uio_find_mem_index(vma);
+	if (mi < 0)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
+	return remap_pfn_range(vma,
+			       vma->vm_start,
+			       idev->info->mem[mi].addr >> PAGE_SHIFT,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static int uio_mmap_logical(struct vm_area_struct *vma)
+{
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &uio_vm_ops;
+	uio_vma_open(vma);
+	return 0;
+}
+
+static int uio_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	struct uio_listener *listener = filep->private_data;
+	struct uio_device *idev = listener->dev;
+	int mi;
+	unsigned long requested_pages, actual_pages;
+	int ret = 0;
+
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+
+	vma->vm_private_data = idev;
+
+	mi = uio_find_mem_index(vma);
+	if (mi < 0)
+		return -EINVAL;
+
+	requested_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	actual_pages = (idev->info->mem[mi].size + PAGE_SIZE -1) >> PAGE_SHIFT;
+	if (requested_pages > actual_pages)
+		return -EINVAL;
+
+	if (idev->info->mmap) {
+		if (!try_module_get(idev->owner))
+			return -ENODEV;
+		ret = idev->info->mmap(idev->info, vma);
+		module_put(idev->owner);
+		return ret;
+	}
+
+	switch (idev->info->mem[mi].memtype) {
+		case UIO_MEM_PHYS:
+			return uio_mmap_physical(vma);
+		case UIO_MEM_LOGICAL:
+		case UIO_MEM_VIRTUAL:
+			return uio_mmap_logical(vma);
+		default:
+			return -EINVAL;
+	}
+}
+
+static struct file_operations uio_fops = {
+	.owner		= THIS_MODULE,
+	.open		= uio_open,
+	.release	= uio_release,
+	.read		= uio_read,
+	.mmap		= uio_mmap,
+	.poll		= uio_poll,
+	.fasync		= uio_fasync,
+};
+
+static int uio_major_init(void)
+{
+	uio_major = register_chrdev(0, "uio", &uio_fops);
+	if (uio_major < 0)
+		return uio_major;
+	return 0;
+}
+
+static void uio_major_cleanup(void)
+{
+	unregister_chrdev(uio_major, "uio");
+}
+
+static int init_uio_class(void)
+{
+	int ret = 0;
+
+	if (uio_class != NULL) {
+		kref_get(&uio_class->kref);
+		goto exit;
+	}
+
+	/* This is the first time in here, set everything up properly */
+	ret = uio_major_init();
+	if (ret)
+		goto exit;
+
+	uio_class = kzalloc(sizeof(*uio_class), GFP_KERNEL);
+	if (!uio_class) {
+		ret = -ENOMEM;
+		goto err_kzalloc;
+	}
+
+	kref_init(&uio_class->kref);
+	uio_class->class = class_create(THIS_MODULE, "uio");
+	if (IS_ERR(uio_class->class)) {
+		ret = IS_ERR(uio_class->class);
+		printk(KERN_ERR "class_create failed for uio\n");
+		goto err_class_create;
+	}
+	return 0;
+
+err_class_create:
+	kfree(uio_class);
+	uio_class = NULL;
+err_kzalloc:
+	uio_major_cleanup();
+exit:
+	return ret;
+}
+
+static void release_uio_class(struct kref *kref)
+{
+	/* Ok, we cheat as we know we only have one uio_class */
+	class_destroy(uio_class->class);
+	kfree(uio_class);
+	uio_major_cleanup();
+	uio_class = NULL;
+}
+
+static void uio_class_destroy(void)
+{
+	if (uio_class)
+		kref_put(&uio_class->kref, release_uio_class);
+}
+
+/**
+ * uio_register_device - register a new userspace IO device
+ * @owner:	module that creates the new device
+ * @parent:	parent device
+ * @info:	UIO device capabilities
+ *
+ * returns zero on success or a negative error code.
+ */
+int __uio_register_device(struct module *owner,
+			  struct device *parent,
+			  struct uio_info *info)
+{
+	struct uio_device *idev;
+	int ret = 0;
+
+	if (!parent || !info || !info->name || !info->version)
+		return -EINVAL;
+
+	info->uio_dev = NULL;
+
+	ret = init_uio_class();
+	if (ret)
+		return ret;
+
+	idev = kzalloc(sizeof(*idev), GFP_KERNEL);
+	if (!idev) {
+		ret = -ENOMEM;
+		goto err_kzalloc;
+	}
+
+	idev->owner = owner;
+	idev->info = info;
+	init_waitqueue_head(&idev->wait);
+	atomic_set(&idev->event, 0);
+
+	ret = uio_get_minor(idev);
+	if (ret)
+		goto err_get_minor;
+
+	idev->dev = device_create(uio_class->class, parent,
+				  MKDEV(uio_major, idev->minor),
+				  "uio%d", idev->minor);
+	if (IS_ERR(idev->dev)) {
+		printk(KERN_ERR "UIO: device register failed\n");
+		ret = PTR_ERR(idev->dev);
+		goto err_device_create;
+	}
+	dev_set_drvdata(idev->dev, idev);
+
+	ret = uio_dev_add_attributes(idev);
+	if (ret)
+		goto err_uio_dev_add_attributes;
+
+	info->uio_dev = idev;
+
+	if (idev->info->irq >= 0) {
+		ret = request_irq(idev->info->irq, uio_interrupt,
+				  idev->info->irq_flags, idev->info->name, idev);
+		if (ret)
+			goto err_request_irq;
+	}
+
+	return 0;
+
+err_request_irq:
+	uio_dev_del_attributes(idev);
+err_uio_dev_add_attributes:
+	device_destroy(uio_class->class, MKDEV(uio_major, idev->minor));
+err_device_create:
+	uio_free_minor(idev);
+err_get_minor:
+	kfree(idev);
+err_kzalloc:
+	uio_class_destroy();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__uio_register_device);
+
+/**
+ * uio_unregister_device - unregister a industrial IO device
+ * @info:	UIO device capabilities
+ *
+ */
+void uio_unregister_device(struct uio_info *info)
+{
+	struct uio_device *idev;
+
+	if (!info || !info->uio_dev)
+		return;
+
+	idev = info->uio_dev;
+
+	uio_free_minor(idev);
+
+	if (info->irq >= 0)
+		free_irq(info->irq, idev);
+
+	uio_dev_del_attributes(idev);
+
+	dev_set_drvdata(idev->dev, NULL);
+	device_destroy(uio_class->class, MKDEV(uio_major, idev->minor));
+	kfree(idev);
+	uio_class_destroy();
+
+	return;
+}
+EXPORT_SYMBOL_GPL(uio_unregister_device);
+
+static int __init uio_init(void)
+{
+	return 0;
+}
+
+static void __exit uio_exit(void)
+{
+}
+
+module_init(uio_init)
+module_exit(uio_exit)
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h
new file mode 100644
index 000000000000..44c28e94df50
--- /dev/null
+++ b/include/linux/uio_driver.h
@@ -0,0 +1,91 @@
+/*
+ * include/linux/uio_driver.h
+ *
+ * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
+ * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
+ * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Userspace IO driver.
+ *
+ * Licensed under the GPLv2 only.
+ */
+
+#ifndef _UIO_DRIVER_H_
+#define _UIO_DRIVER_H_
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+
+/**
+ * struct uio_mem - description of a UIO memory region
+ * @kobj:		kobject for this mapping
+ * @addr:		address of the device's memory
+ * @size:		size of IO
+ * @memtype:		type of memory addr points to
+ * @internal_addr:	ioremap-ped version of addr, for driver internal use
+ */
+struct uio_mem {
+	struct kobject		kobj;
+	unsigned long		addr;
+	unsigned long		size;
+	int			memtype;
+	void __iomem		*internal_addr;
+};
+
+#define MAX_UIO_MAPS 	5
+
+struct uio_device;
+
+/**
+ * struct uio_info - UIO device capabilities
+ * @uio_dev:		the UIO device this info belongs to
+ * @name:		device name
+ * @version:		device driver version
+ * @mem:		list of mappable memory regions, size==0 for end of list
+ * @irq:		interrupt number or UIO_IRQ_CUSTOM
+ * @irq_flags:		flags for request_irq()
+ * @priv:		optional private data
+ * @handler:		the device's irq handler
+ * @mmap:		mmap operation for this uio device
+ * @open:		open operation for this uio device
+ * @release:		release operation for this uio device
+ */
+struct uio_info {
+	struct uio_device	*uio_dev;
+	char			*name;
+	char			*version;
+	struct uio_mem		mem[MAX_UIO_MAPS];
+	long			irq;
+	unsigned long		irq_flags;
+	void			*priv;
+	irqreturn_t (*handler)(int irq, struct uio_info *dev_info);
+	int (*mmap)(struct uio_info *info, struct vm_area_struct *vma);
+	int (*open)(struct uio_info *info, struct inode *inode);
+	int (*release)(struct uio_info *info, struct inode *inode);
+};
+
+extern int __must_check
+	__uio_register_device(struct module *owner,
+			      struct device *parent,
+			      struct uio_info *info);
+static inline int __must_check
+	uio_register_device(struct device *parent, struct uio_info *info)
+{
+	return __uio_register_device(THIS_MODULE, parent, info);
+}
+extern void uio_unregister_device(struct uio_info *info);
+extern void uio_event_notify(struct uio_info *info);
+
+/* defines for uio_device->irq */
+#define UIO_IRQ_CUSTOM	-1
+#define UIO_IRQ_NONE	-2
+
+/* defines for uio_device->memtype */
+#define UIO_MEM_NONE	0
+#define UIO_MEM_PHYS	1
+#define UIO_MEM_LOGICAL	2
+#define UIO_MEM_VIRTUAL 3
+
+#endif /* _LINUX_UIO_DRIVER_H_ */
-- 
cgit v1.3-8-gc7d7


From a9933cea7a1d80dd9efae9f1acd857f5dce742b9 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 7 Jun 2007 17:09:49 -0400
Subject: locks: rename lease functions to reflect locks.c conventions

We've been using the convention that vfs_foo is the function that calls
a filesystem-specific foo method if it exists, or falls back on a
generic method if it doesn't; thus vfs_foo is what is called when some
other part of the kernel (normally lockd or nfsd) wants to get a lock,
whereas foo is what filesystems call to use the underlying local
functionality as part of their lock implementation.

So rename setlease to vfs_setlease (which will call a
filesystem-specific setlease after a later patch) and __setlease to
setlease.

Also, vfs_setlease need only be GPL-exported as long as it's only needed
by lockd and nfsd.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
---
 fs/locks.c          | 15 +++++++--------
 fs/nfsd/nfs4state.c | 10 +++++-----
 include/linux/fs.h  |  2 +-
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 0e5873b0be54..a65d85c1fdc2 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1326,7 +1326,7 @@ int fcntl_getlease(struct file *filp)
 }
 
 /**
- *	__setlease	-	sets a lease on an open file
+ *	setlease	-	sets a lease on an open file
  *	@filp: file pointer
  *	@arg: type of lease to obtain
  *	@flp: input - file_lock to use, output - file_lock inserted
@@ -1336,7 +1336,7 @@ int fcntl_getlease(struct file *filp)
  *
  *	Called with kernel lock held.
  */
-static int __setlease(struct file *filp, long arg, struct file_lock **flp)
+static int setlease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1423,7 +1423,7 @@ out:
 }
 
  /**
- *	setlease        -       sets a lease on an open file
+ *	vfs_setlease        -       sets a lease on an open file
  *	@filp: file pointer
  *	@arg: type of lease to obtain
  *	@lease: file_lock to use
@@ -1432,18 +1432,17 @@ out:
  *	The fl_lmops fl_break function is required by break_lease
  */
 
-int setlease(struct file *filp, long arg, struct file_lock **lease)
+int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
 	int error;
 
 	lock_kernel();
-	error = __setlease(filp, arg, lease);
+	error = setlease(filp, arg, lease);
 	unlock_kernel();
 
 	return error;
 }
-
-EXPORT_SYMBOL(setlease);
+EXPORT_SYMBOL_GPL(vfs_setlease);
 
 /**
  *	fcntl_setlease	-	sets a lease on an open file
@@ -1469,7 +1468,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 
 	lock_kernel();
 
-	error = __setlease(filp, arg, &flp);
+	error = vfs_setlease(filp, arg, &flp);
 	if (error || arg == F_UNLCK)
 		goto out_unlock;
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e4a4c87ec8c6..6284807bd37e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -256,7 +256,7 @@ nfs4_close_delegation(struct nfs4_delegation *dp)
 	/* The following nfsd_close may not actually close the file,
 	 * but we want to remove the lease in any case. */
 	if (dp->dl_flock)
-		setlease(filp, F_UNLCK, &dp->dl_flock);
+		vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
 	nfsd_close(filp);
 }
 
@@ -1402,7 +1402,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
 /*
  * Set the delegation file_lock back pointer.
  *
- * Called from __setlease() with lock_kernel() held.
+ * Called from setlease() with lock_kernel() held.
  */
 static
 void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
@@ -1416,7 +1416,7 @@ void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl)
 }
 
 /*
- * Called from __setlease() with lock_kernel() held
+ * Called from setlease() with lock_kernel() held
  */
 static
 int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
@@ -1716,10 +1716,10 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	fl.fl_file = stp->st_vfs_file;
 	fl.fl_pid = current->tgid;
 
-	/* setlease checks to see if delegation should be handed out.
+	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callbacks fl_mylease and fl_change are used
 	 */
-	if ((status = setlease(stp->st_vfs_file,
+	if ((status = vfs_setlease(stp->st_vfs_file,
 		flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK, &flp))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
 		unhash_delegation(dp);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98205f680476..a24f029accc0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -872,7 +872,7 @@ extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
-extern int setlease(struct file *, long, struct file_lock **);
+extern int vfs_setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
-- 
cgit v1.3-8-gc7d7


From f9ffed26d6f3e6ac9988947242821579d615fda7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 14 Nov 2006 15:51:40 -0500
Subject: locks: provide a file lease method enabling cluster-coherent leases

Currently leases are only kept locally, so there's no way for a distributed
filesystem to enforce them against multiple clients.  We're particularly
interested in the case of nfsd exporting a cluster filesystem, in which
case nfsd needs cluster-coherent leases in order to implement delegations
correctly.

Also add some documentation.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/locks.c         | 24 ++++++++++++++++++++++--
 include/linux/fs.h |  1 +
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index a65d85c1fdc2..94f5d8065e3a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1429,7 +1429,24 @@ out:
  *	@lease: file_lock to use
  *
  *	Call this to establish a lease on the file.
- *	The fl_lmops fl_break function is required by break_lease
+ *	The (*lease)->fl_lmops->fl_break operation must be set; if not,
+ *	break_lease will oops!
+ *
+ *	This will call the filesystem's setlease file method, if
+ *	defined.  Note that there is no getlease method; instead, the
+ *	filesystem setlease method should call back to setlease() to
+ *	add a lease to the inode's lease list, where fcntl_getlease() can
+ *	find it.  Since fcntl_getlease() only reports whether the current
+ *	task holds a lease, a cluster filesystem need only do this for
+ *	leases held by processes on this node.
+ *
+ *	There is also no break_lease method; filesystems that
+ *	handle their own leases shoud break leases themselves from the
+ *	filesystem's open, create, and (on truncate) setattr methods.
+ *
+ *	Warning: the only current setlease methods exist only to disable
+ *	leases in certain cases.  More vfs changes may be required to
+ *	allow a full filesystem lease implementation.
  */
 
 int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
@@ -1437,7 +1454,10 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 	int error;
 
 	lock_kernel();
-	error = setlease(filp, arg, lease);
+	if (filp->f_op && filp->f_op->setlease)
+		error = filp->f_op->setlease(filp, arg, lease);
+	else
+		error = setlease(filp, arg, lease);
 	unlock_kernel();
 
 	return error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a24f029accc0..c8ddf34e9710 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1122,6 +1122,7 @@ struct file_operations {
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*setlease)(struct file *, long, struct file_lock **);
 };
 
 struct inode_operations {
-- 
cgit v1.3-8-gc7d7


From 4698afe8e3a725576366f86560a8a8242b21b9f7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 4 Jul 2007 17:21:37 -0400
Subject: locks: export setlease to filesystems

Export setlease so it can used by filesystems to implement their lease
methods.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
---
 fs/locks.c         | 3 ++-
 include/linux/fs.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 94f5d8065e3a..4c73b857dded 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1336,7 +1336,7 @@ int fcntl_getlease(struct file *filp)
  *
  *	Called with kernel lock held.
  */
-static int setlease(struct file *filp, long arg, struct file_lock **flp)
+int setlease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1421,6 +1421,7 @@ static int setlease(struct file *filp, long arg, struct file_lock **flp)
 out:
 	return error;
 }
+EXPORT_SYMBOL(setlease);
 
  /**
  *	vfs_setlease        -       sets a lease on an open file
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c8ddf34e9710..b188c2e5338d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -872,6 +872,7 @@ extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
+extern int setlease(struct file *, long, struct file_lock **);
 extern int vfs_setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
-- 
cgit v1.3-8-gc7d7


From 6d34ac199a4af5c678a3a8f3275aeb2586b72da3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 11 May 2007 16:09:32 -0400
Subject: locks: make posix_test_lock() interface more consistent

Since posix_test_lock(), like fcntl() and ->lock(), indicates absence or
presence of a conflict lock by setting fl_type to, respectively, F_UNLCK
or something other than F_UNLCK, the return value is no longer needed.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
---
 fs/locks.c         | 10 ++++------
 fs/nfs/file.c      |  4 +++-
 include/linux/fs.h |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 4c73b857dded..4a8072736efa 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -659,7 +659,7 @@ static int locks_block_on_timeout(struct file_lock *blocker, struct file_lock *w
 	return result;
 }
 
-int
+void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
@@ -671,14 +671,12 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 		if (posix_locks_conflict(cfl, fl))
 			break;
 	}
-	if (cfl) {
+	if (cfl)
 		__locks_copy_lock(fl, cfl);
-		unlock_kernel();
-		return 1;
-	} else
+	else
 		fl->fl_type = F_UNLCK;
 	unlock_kernel();
-	return 0;
+	return;
 }
 
 EXPORT_SYMBOL(posix_test_lock);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 13ac6fa2b62b..c87dc713b5d7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -402,7 +402,9 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
 
 	lock_kernel();
 	/* Try local locking first */
-	if (posix_test_lock(filp, fl)) {
+	posix_test_lock(filp, fl);
+	if (fl->fl_type != F_UNLCK) {
+		/* found a conflict */
 		goto out;
 	}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b188c2e5338d..80deaaf1b746 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -862,7 +862,7 @@ extern void locks_init_lock(struct file_lock *);
 extern void locks_copy_lock(struct file_lock *, struct file_lock *);
 extern void locks_remove_posix(struct file *, fl_owner_t);
 extern void locks_remove_flock(struct file *);
-extern int posix_test_lock(struct file *, struct file_lock *);
+extern void posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file *, struct file_lock *);
-- 
cgit v1.3-8-gc7d7


From 5417169026c3df151adf5a65eb061278b0a72e69 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Thu, 19 Jul 2007 17:39:55 +1000
Subject: [FS] Implement block_page_mkwrite.

Many filesystems need a ->page-mkwrite callout to correctly
set up pages that have been written to by mmap. This is especially
important when mmap is writing into holes as it allows filesystems
to correctly account for and allocate space before the mmap
write is allowed to proceed.

Protection against truncate races is provided by locking the page
and checking to see whether the page mapping is correct and whether
it is beyond EOF so we don't end up allowing allocations beyond
the current EOF or changing EOF as a result of a mmap write.

SGI-PV: 940392
SGI-Modid: 2.6.x-xfs-melb:linux:29146a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
---
 fs/buffer.c                 | 47 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/buffer_head.h |  2 ++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 0f9006714230..02ebb1f1d3b0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2194,6 +2194,52 @@ int generic_commit_write(struct file *file, struct page *page,
 	return 0;
 }
 
+/*
+ * block_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF.  Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+int
+block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+		   get_block_t get_block)
+{
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	unsigned long end;
+	loff_t size;
+	int ret = -EINVAL;
+
+	lock_page(page);
+	size = i_size_read(inode);
+	if ((page->mapping != inode->i_mapping) ||
+	    ((page->index << PAGE_CACHE_SHIFT) > size)) {
+		/* page got truncated out from underneath us */
+		goto out_unlock;
+	}
+
+	/* page is wholly or partially inside EOF */
+	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
+		end = size & ~PAGE_CACHE_MASK;
+	else
+		end = PAGE_CACHE_SIZE;
+
+	ret = block_prepare_write(page, 0, end, get_block);
+	if (!ret)
+		ret = block_commit_write(page, 0, end);
+
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
 
 /*
  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
@@ -2977,6 +3023,7 @@ EXPORT_SYMBOL(__brelse);
 EXPORT_SYMBOL(__wait_on_buffer);
 EXPORT_SYMBOL(block_commit_write);
 EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(block_page_mkwrite);
 EXPORT_SYMBOL(block_read_full_page);
 EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 5c6e12853a9b..35cadad84b14 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -209,6 +209,8 @@ int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 int generic_cont_expand(struct inode *inode, loff_t size);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
+int block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+				get_block_t get_block);
 void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
-- 
cgit v1.3-8-gc7d7


From d00806b183152af6d24f46f0c33f14162ca1262a Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 19 Jul 2007 01:46:57 -0700
Subject: mm: fix fault vs invalidate race for linear mappings

Fix the race between invalidate_inode_pages and do_no_page.

Andrea Arcangeli identified a subtle race between invalidation of pages from
pagecache with userspace mappings, and do_no_page.

The issue is that invalidation has to shoot down all mappings to the page,
before it can be discarded from the pagecache.  Between shooting down ptes to
a particular page, and actually dropping the struct page from the pagecache,
do_no_page from any process might fault on that page and establish a new
mapping to the page just before it gets discarded from the pagecache.

The most common case where such invalidation is used is in file truncation.
This case was catered for by doing a sort of open-coded seqlock between the
file's i_size, and its truncate_count.

Truncation will decrease i_size, then increment truncate_count before
unmapping userspace pages; do_no_page will read truncate_count, then find the
page if it is within i_size, and then check truncate_count under the page
table lock and back out and retry if it had subsequently been changed (ptl
will serialise against unmapping, and ensure a potentially updated
truncate_count is actually visible).

Complexity and documentation issues aside, the locking protocol fails in the
case where we would like to invalidate pagecache inside i_size.  do_no_page
can come in anytime and filemap_nopage is not aware of the invalidation in
progress (as it is when it is outside i_size).  The end result is that
dangling (->mapping == NULL) pages that appear to be from a particular file
may be mapped into userspace with nonsense data.  Valid mappings to the same
place will see a different page.

Andrea implemented two working fixes, one using a real seqlock, another using
a page->flags bit.  He also proposed using the page lock in do_no_page, but
that was initially considered too heavyweight.  However, it is not a global or
per-file lock, and the page cacheline is modified in do_no_page to increment
_count and _mapcount anyway, so a further modification should not be a large
performance hit.  Scalability is not an issue.

This patch implements this latter approach.  ->nopage implementations return
with the page locked if it is possible for their underlying file to be
invalidated (in that case, they must set a special vm_flags bit to indicate
so).  do_no_page only unlocks the page after setting up the mapping
completely.  invalidation is excluded because it holds the page lock during
invalidation of each page (and ensures that the page is not mapped while
holding the lock).

This also allows significant simplifications in do_no_page, because we have
the page locked in the right place in the pagecache from the start.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/gfs2/ops_file.c          |   2 +
 fs/gfs2/ops_vm.c            |   2 +
 fs/ncpfs/mmap.c             |   1 +
 fs/ocfs2/mmap.c             |   1 +
 fs/xfs/linux-2.6/xfs_file.c |   1 +
 include/linux/mm.h          |   6 ++
 mm/filemap.c                |  53 ++++++---------
 mm/memory.c                 | 153 +++++++++++++++++++++-----------------------
 mm/shmem.c                  |  11 +++-
 mm/truncate.c               |  13 +++-
 10 files changed, 127 insertions(+), 116 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 1a5e8e893d75..bad0b24cb773 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -364,6 +364,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	else
 		vma->vm_ops = &gfs2_vm_ops_private;
 
+	vma->vm_flags |= VM_CAN_INVALIDATE;
+
 	gfs2_glock_dq_uninit(&i_gh);
 
 	return error;
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index 404b7cc9f8c4..d5a98cbfebdc 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -138,6 +138,8 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
 	if (alloc_required) {
 		error = alloc_page_backing(ip, result);
 		if (error) {
+			if (area->vm_flags & VM_CAN_INVALIDATE)
+				unlock_page(result);
 			page_cache_release(result);
 			result = NULL;
 			goto out;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 70a69115500f..5416673418b8 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -123,6 +123,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EFBIG;
 
 	vma->vm_ops = &ncp_file_mmap;
+	vma->vm_flags |= VM_CAN_INVALIDATE;
 	file_accessed(file);
 	return 0;
 }
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index d79aa12137d2..904f39ff5340 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -226,6 +226,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
 out:
 	vma->vm_ops = &ocfs2_file_vm_ops;
+	vma->vm_flags |= VM_CAN_INVALIDATE;
 	return 0;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index cbcd40c8c2a0..92b2f225712f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -310,6 +310,7 @@ xfs_file_mmap(
 	struct vm_area_struct *vma)
 {
 	vma->vm_ops = &xfs_file_vm_ops;
+	vma->vm_flags |= VM_CAN_INVALIDATE;
 
 #ifdef CONFIG_XFS_DMAPI
 	if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a5c451816fdc..ca9536a348c8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -168,6 +168,12 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
 #define VM_ALWAYSDUMP	0x04000000	/* Always include in core dumps */
 
+#define VM_CAN_INVALIDATE 0x08000000	/* The mapping may be invalidated,
+					 * eg. truncate or invalidate_inode_*.
+					 * In this case, do_no_page must
+					 * return with the page locked.
+					 */
+
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
 #endif
diff --git a/mm/filemap.c b/mm/filemap.c
index 5d5449f3d41c..462cda58a18e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1325,9 +1325,10 @@ struct page *filemap_nopage(struct vm_area_struct *area,
 	unsigned long size, pgoff;
 	int did_readaround = 0, majmin = VM_FAULT_MINOR;
 
+	BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
+
 	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
 
-retry_all:
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (pgoff >= size)
 		goto outside_data_content;
@@ -1349,7 +1350,7 @@ retry_all:
 	 * Do we have something in the page cache already?
 	 */
 retry_find:
-	page = find_get_page(mapping, pgoff);
+	page = find_lock_page(mapping, pgoff);
 	if (!page) {
 		unsigned long ra_pages;
 
@@ -1383,7 +1384,7 @@ retry_find:
 				start = pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_get_page(mapping, pgoff);
+		page = find_lock_page(mapping, pgoff);
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1392,13 +1393,19 @@ retry_find:
 		ra->mmap_hit++;
 
 	/*
-	 * Ok, found a page in the page cache, now we need to check
-	 * that it's up-to-date.
+	 * We have a locked page in the page cache, now we need to check
+	 * that it's up-to-date. If not, it is going to be due to an error.
 	 */
-	if (!PageUptodate(page))
+	if (unlikely(!PageUptodate(page)))
 		goto page_not_uptodate;
 
-success:
+	/* Must recheck i_size under page lock */
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (unlikely(pgoff >= size)) {
+		unlock_page(page);
+		goto outside_data_content;
+	}
+
 	/*
 	 * Found the page and have a reference on it.
 	 */
@@ -1440,6 +1447,7 @@ no_cached_page:
 	return NOPAGE_SIGBUS;
 
 page_not_uptodate:
+	/* IO error path */
 	if (!did_readaround) {
 		majmin = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
@@ -1451,37 +1459,15 @@ page_not_uptodate:
 	 * because there really aren't any performance issues here
 	 * and we need to check for errors.
 	 */
-	lock_page(page);
-
-	/* Somebody truncated the page on us? */
-	if (!page->mapping) {
-		unlock_page(page);
-		page_cache_release(page);
-		goto retry_all;
-	}
-
-	/* Somebody else successfully read it in? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
 	ClearPageError(page);
 	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
+	page_cache_release(page);
+
+	if (!error || error == AOP_TRUNCATED_PAGE)
 		goto retry_find;
-	}
 
-	/*
-	 * Things didn't work out. Return zero to tell the
-	 * mm layer so, possibly freeing the page cache page first.
-	 */
+	/* Things didn't work out. Return zero to tell the mm layer so. */
 	shrink_readahead_size_eio(file, ra);
-	page_cache_release(page);
 	return NOPAGE_SIGBUS;
 }
 EXPORT_SYMBOL(filemap_nopage);
@@ -1674,6 +1660,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
+	vma->vm_flags |= VM_CAN_INVALIDATE;
 	return 0;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 9c6ff7fffdc8..e6c99f6b5649 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1831,6 +1831,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
 	unsigned long restart_addr;
 	int need_break;
 
+	/*
+	 * files that support invalidating or truncating portions of the
+	 * file from under mmaped areas must set the VM_CAN_INVALIDATE flag, and
+	 * have their .nopage function return the page locked.
+	 */
+	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
+
 again:
 	restart_addr = vma->vm_truncate_count;
 	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
@@ -1959,17 +1966,8 @@ void unmap_mapping_range(struct address_space *mapping,
 
 	spin_lock(&mapping->i_mmap_lock);
 
-	/* serialize i_size write against truncate_count write */
-	smp_wmb();
-	/* Protect against page faults, and endless unmapping loops */
+	/* Protect against endless unmapping loops */
 	mapping->truncate_count++;
-	/*
-	 * For archs where spin_lock has inclusive semantics like ia64
-	 * this smp_mb() will prevent to read pagetable contents
-	 * before the truncate_count increment is visible to
-	 * other cpus.
-	 */
-	smp_mb();
 	if (unlikely(is_restart_addr(mapping->truncate_count))) {
 		if (mapping->truncate_count == 0)
 			reset_vma_truncate_counts(mapping);
@@ -2008,8 +2006,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
 	if (IS_SWAPFILE(inode))
 		goto out_busy;
 	i_size_write(inode, offset);
+
+	/*
+	 * unmap_mapping_range is called twice, first simply for efficiency
+	 * so that truncate_inode_pages does fewer single-page unmaps. However
+	 * after this first call, and before truncate_inode_pages finishes,
+	 * it is possible for private pages to be COWed, which remain after
+	 * truncate_inode_pages finishes, hence the second unmap_mapping_range
+	 * call must be made for correctness.
+	 */
 	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
 	truncate_inode_pages(mapping, offset);
+	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
 	goto out_truncate;
 
 do_expand:
@@ -2049,6 +2057,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	down_write(&inode->i_alloc_sem);
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	truncate_inode_pages_range(mapping, offset, end);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	inode->i_op->truncate_range(inode, offset, end);
 	up_write(&inode->i_alloc_sem);
 	mutex_unlock(&inode->i_mutex);
@@ -2206,7 +2215,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
-	lazy_mmu_prot_update(pte);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
@@ -2297,10 +2305,8 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		int write_access)
 {
 	spinlock_t *ptl;
-	struct page *new_page;
-	struct address_space *mapping = NULL;
+	struct page *page, *nopage_page;
 	pte_t entry;
-	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 	struct page *dirty_page = NULL;
@@ -2308,74 +2314,53 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	if (vma->vm_file) {
-		mapping = vma->vm_file->f_mapping;
-		sequence = mapping->truncate_count;
-		smp_rmb(); /* serializes i_size against truncate_count */
-	}
-retry:
-	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-	/*
-	 * No smp_rmb is needed here as long as there's a full
-	 * spin_lock/unlock sequence inside the ->nopage callback
-	 * (for the pagecache lookup) that acts as an implicit
-	 * smp_mb() and prevents the i_size read to happen
-	 * after the next truncate_count read.
-	 */
-
+	nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 	/* no page was available -- either SIGBUS, OOM or REFAULT */
-	if (unlikely(new_page == NOPAGE_SIGBUS))
+	if (unlikely(nopage_page == NOPAGE_SIGBUS))
 		return VM_FAULT_SIGBUS;
-	else if (unlikely(new_page == NOPAGE_OOM))
+	else if (unlikely(nopage_page == NOPAGE_OOM))
 		return VM_FAULT_OOM;
-	else if (unlikely(new_page == NOPAGE_REFAULT))
+	else if (unlikely(nopage_page == NOPAGE_REFAULT))
 		return VM_FAULT_MINOR;
 
+	BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page));
+	/*
+	 * For consistency in subsequent calls, make the nopage_page always
+	 * locked.
+	 */
+	if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
+		lock_page(nopage_page);
+
 	/*
 	 * Should we do an early C-O-W break?
 	 */
+	page = nopage_page;
 	if (write_access) {
 		if (!(vma->vm_flags & VM_SHARED)) {
-			struct page *page;
-
-			if (unlikely(anon_vma_prepare(vma)))
-				goto oom;
-			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-						vma, address);
-			if (!page)
-				goto oom;
-			copy_user_highpage(page, new_page, address, vma);
-			page_cache_release(new_page);
-			new_page = page;
+			if (unlikely(anon_vma_prepare(vma))) {
+				ret = VM_FAULT_OOM;
+				goto out_error;
+			}
+			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+			if (!page) {
+				ret = VM_FAULT_OOM;
+				goto out_error;
+			}
+			copy_user_highpage(page, nopage_page, address, vma);
 			anon = 1;
-
 		} else {
 			/* if the page will be shareable, see if the backing
 			 * address space wants to know that the page is about
 			 * to become writable */
 			if (vma->vm_ops->page_mkwrite &&
-			    vma->vm_ops->page_mkwrite(vma, new_page) < 0
-			    ) {
-				page_cache_release(new_page);
-				return VM_FAULT_SIGBUS;
+			    vma->vm_ops->page_mkwrite(vma, page) < 0) {
+				ret = VM_FAULT_SIGBUS;
+				goto out_error;
 			}
 		}
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	/*
-	 * For a file-backed vma, someone could have truncated or otherwise
-	 * invalidated this page.  If unmap_mapping_range got called,
-	 * retry getting the page.
-	 */
-	if (mapping && unlikely(sequence != mapping->truncate_count)) {
-		pte_unmap_unlock(page_table, ptl);
-		page_cache_release(new_page);
-		cond_resched();
-		sequence = mapping->truncate_count;
-		smp_rmb();
-		goto retry;
-	}
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -2388,43 +2373,51 @@ retry:
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		flush_icache_page(vma, new_page);
-		entry = mk_pte(new_page, vma->vm_page_prot);
+	if (likely(pte_none(*page_table))) {
+		flush_icache_page(vma, page);
+		entry = mk_pte(page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
-			inc_mm_counter(mm, anon_rss);
-			lru_cache_add_active(new_page);
-			page_add_new_anon_rmap(new_page, vma, address);
+                        inc_mm_counter(mm, anon_rss);
+                        lru_cache_add_active(page);
+                        page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
-			page_add_file_rmap(new_page);
+			page_add_file_rmap(page);
 			if (write_access) {
-				dirty_page = new_page;
+				dirty_page = page;
 				get_page(dirty_page);
 			}
 		}
+
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
 	} else {
-		/* One of our sibling threads was faster, back out. */
-		page_cache_release(new_page);
-		goto unlock;
+		if (anon)
+			page_cache_release(page);
+		else
+			anon = 1; /* not anon, but release nopage_page */
 	}
 
-	/* no need to invalidate: a not-present page shouldn't be cached */
-	update_mmu_cache(vma, address, entry);
-	lazy_mmu_prot_update(entry);
-unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (dirty_page) {
+
+out:
+	unlock_page(nopage_page);
+	if (anon)
+		page_cache_release(nopage_page);
+	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
+
 	return ret;
-oom:
-	page_cache_release(new_page);
-	return VM_FAULT_OOM;
+
+out_error:
+	anon = 1; /* relase nopage_page */
+	goto out;
 }
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 96fa79fb6ad3..5808fadd3944 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -83,6 +83,7 @@ enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_WRITE,	/* may exceed i_size, may allocate page */
+	SGP_NOPAGE,	/* same as SGP_CACHE, return with page locked */
 };
 
 static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -1289,8 +1290,10 @@ repeat:
 	}
 done:
 	if (*pagep != filepage) {
-		unlock_page(filepage);
 		*pagep = filepage;
+		if (sgp != SGP_NOPAGE)
+			unlock_page(filepage);
+
 	}
 	return 0;
 
@@ -1310,13 +1313,15 @@ static struct page *shmem_nopage(struct vm_area_struct *vma,
 	unsigned long idx;
 	int error;
 
+	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
+
 	idx = (address - vma->vm_start) >> PAGE_SHIFT;
 	idx += vma->vm_pgoff;
 	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 		return NOPAGE_SIGBUS;
 
-	error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
+	error = shmem_getpage(inode, idx, &page, SGP_NOPAGE, type);
 	if (error)
 		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
 
@@ -1414,6 +1419,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
+	vma->vm_flags |= VM_CAN_INVALIDATE;
 	return 0;
 }
 
@@ -2596,5 +2602,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 		fput(vma->vm_file);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
+	vma->vm_flags |= VM_CAN_INVALIDATE;
 	return 0;
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index f47e46d1be3b..aed85f0b707f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -192,6 +192,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				unlock_page(page);
 				continue;
 			}
+			if (page_mapped(page)) {
+				unmap_mapping_range(mapping,
+				  (loff_t)page_index<<PAGE_CACHE_SHIFT,
+				  PAGE_CACHE_SIZE, 0);
+			}
 			truncate_complete_page(mapping, page);
 			unlock_page(page);
 		}
@@ -229,6 +234,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
+			if (page_mapped(page)) {
+				unmap_mapping_range(mapping,
+				  (loff_t)page->index<<PAGE_CACHE_SHIFT,
+				  PAGE_CACHE_SIZE, 0);
+			}
 			if (page->index > next)
 				next = page->index;
 			next++;
@@ -405,7 +415,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				break;
 			}
 			wait_on_page_writeback(page);
-			while (page_mapped(page)) {
+			if (page_mapped(page)) {
 				if (!did_range_unmap) {
 					/*
 					 * Zap the rest of the file in one hit.
@@ -425,6 +435,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 					  PAGE_CACHE_SIZE, 0);
 				}
 			}
+			BUG_ON(page_mapped(page));
 			ret = do_launder_page(mapping, page);
 			if (ret == 0 && !invalidate_complete_page2(mapping, page))
 				ret = -EIO;
-- 
cgit v1.3-8-gc7d7


From 54cb8821de07f2ffcd28c380ce9b93d5784b40d7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 19 Jul 2007 01:46:59 -0700
Subject: mm: merge populate and nopage into fault (fixes nonlinear)

Nonlinear mappings are (AFAIKS) simply a virtual memory concept that encodes
the virtual address -> file offset differently from linear mappings.

->populate is a layering violation because the filesystem/pagecache code
should need to know anything about the virtual memory mapping.  The hitch here
is that the ->nopage handler didn't pass down enough information (ie.  pgoff).
 But it is more logical to pass pgoff rather than have the ->nopage function
calculate it itself anyway (because that's a similar layering violation).

Having the populate handler install the pte itself is likewise a nasty thing
to be doing.

This patch introduces a new fault handler that replaces ->nopage and
->populate and (later) ->nopfn.  Most of the old mechanism is still in place
so there is a lot of duplication and nice cleanups that can be removed if
everyone switches over.

The rationale for doing this in the first place is that nonlinear mappings are
subject to the pagefault vs invalidate/truncate race too, and it seemed stupid
to duplicate the synchronisation logic rather than just consolidate the two.

After this patch, MAP_NONBLOCK no longer sets up ptes for pages present in
pagecache.  Seems like a fringe functionality anyway.

NOPAGE_REFAULT is removed.  This should be implemented with ->fault, and no
users have hit mainline yet.

[akpm@linux-foundation.org: cleanup]
[randy.dunlap@oracle.com: doc. fixes for readahead]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/feature-removal-schedule.txt |  27 ++++++
 Documentation/filesystems/Locking          |   2 +
 fs/gfs2/ops_address.c                      |   2 +-
 fs/gfs2/ops_file.c                         |   2 +-
 fs/gfs2/ops_vm.c                           |  36 ++++----
 fs/ncpfs/mmap.c                            |  23 ++---
 fs/ocfs2/aops.c                            |   2 +-
 fs/ocfs2/mmap.c                            |  17 ++--
 fs/xfs/linux-2.6/xfs_file.c                |  23 +++--
 include/linux/mm.h                         |  41 +++++++--
 ipc/shm.c                                  |   9 +-
 mm/filemap.c                               |  94 ++++++++++++--------
 mm/filemap_xip.c                           |  54 ++++++------
 mm/fremap.c                                | 103 +++++++++++++++-------
 mm/memory.c                                | 132 +++++++++++++++++++----------
 mm/mmap.c                                  |   8 +-
 mm/nommu.c                                 |   3 +-
 mm/rmap.c                                  |   4 +-
 mm/shmem.c                                 |  82 +++++-------------
 mm/truncate.c                              |   2 +-
 20 files changed, 394 insertions(+), 272 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 66c8b4b165c1..716568afdff8 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -135,6 +135,33 @@ Who:	Greg Kroah-Hartman <gregkh@suse.de>
 
 ---------------------------
 
+What:	filemap_nopage, filemap_populate
+When:	April 2007
+Why:	These legacy interfaces no longer have any callers in the kernel and
+	any functionality provided can be provided with filemap_fault. The
+	removal schedule is short because they are a big maintainence burden
+	and have some bugs.
+Who:	Nick Piggin <npiggin@suse.de>
+
+---------------------------
+
+What:	vm_ops.populate, install_page
+When:	April 2007
+Why:	These legacy interfaces no longer have any callers in the kernel and
+	any functionality provided can be provided with vm_ops.fault.
+Who:	Nick Piggin <npiggin@suse.de>
+
+---------------------------
+
+What:	vm_ops.nopage
+When:	February 2008, provided in-kernel callers have been converted
+Why:	This interface is replaced by vm_ops.fault, but it has been around
+	forever, is used by a lot of drivers, and doesn't cost much to
+	maintain.
+Who:	Nick Piggin <npiggin@suse.de>
+
+---------------------------
+
 What:	Interrupt only SA_* flags
 When:	September 2007
 Why:	The interrupt related SA_* flags are replaced by IRQF_* to move them
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index d866551be037..970c8ec1a05b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -510,12 +510,14 @@ More details about quota locking can be found in fs/dquot.c.
 prototypes:
 	void (*open)(struct vm_area_struct*);
 	void (*close)(struct vm_area_struct*);
+	struct page *(*fault)(struct vm_area_struct*, struct fault_data *);
 	struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *);
 
 locking rules:
 		BKL	mmap_sem
 open:		no	yes
 close:		no	yes
+fault:		no	yes
 nopage:		no	yes
 
 ================================================================================
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 26c888890c24..ce90032c010e 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -251,7 +251,7 @@ static int gfs2_readpage(struct file *file, struct page *page)
 		if (file) {
 			gf = file->private_data;
 			if (test_bit(GFF_EXLOCK, &gf->f_flags))
-				/* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
+				/* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
 				goto skip_lock;
 		}
 		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index bad0b24cb773..581ac11b2656 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -364,7 +364,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	else
 		vma->vm_ops = &gfs2_vm_ops_private;
 
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE|VM_CAN_NONLINEAR;
 
 	gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index d5a98cbfebdc..e9fe6eb74e75 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -27,13 +27,13 @@
 #include "trans.h"
 #include "util.h"
 
-static struct page *gfs2_private_nopage(struct vm_area_struct *area,
-					unsigned long address, int *type)
+static struct page *gfs2_private_fault(struct vm_area_struct *vma,
+					struct fault_data *fdata)
 {
-	struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
+	struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
 
 	set_bit(GIF_PAGED, &ip->i_flags);
-	return filemap_nopage(area, address, type);
+	return filemap_fault(vma, fdata);
 }
 
 static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
@@ -104,16 +104,14 @@ out:
 	return error;
 }
 
-static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
-					   unsigned long address, int *type)
+static struct page *gfs2_sharewrite_fault(struct vm_area_struct *vma,
+						struct fault_data *fdata)
 {
-	struct file *file = area->vm_file;
+	struct file *file = vma->vm_file;
 	struct gfs2_file *gf = file->private_data;
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	struct gfs2_holder i_gh;
 	struct page *result = NULL;
-	unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
-			      area->vm_pgoff;
 	int alloc_required;
 	int error;
 
@@ -124,23 +122,27 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
 	set_bit(GIF_PAGED, &ip->i_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
-	error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
-					  PAGE_CACHE_SIZE, &alloc_required);
-	if (error)
+	error = gfs2_write_alloc_required(ip,
+					(u64)fdata->pgoff << PAGE_CACHE_SHIFT,
+					PAGE_CACHE_SIZE, &alloc_required);
+	if (error) {
+		fdata->type = VM_FAULT_OOM; /* XXX: are these right? */
 		goto out;
+	}
 
 	set_bit(GFF_EXLOCK, &gf->f_flags);
-	result = filemap_nopage(area, address, type);
+	result = filemap_fault(vma, fdata);
 	clear_bit(GFF_EXLOCK, &gf->f_flags);
-	if (!result || result == NOPAGE_OOM)
+	if (!result)
 		goto out;
 
 	if (alloc_required) {
 		error = alloc_page_backing(ip, result);
 		if (error) {
-			if (area->vm_flags & VM_CAN_INVALIDATE)
+			if (vma->vm_flags & VM_CAN_INVALIDATE)
 				unlock_page(result);
 			page_cache_release(result);
+			fdata->type = VM_FAULT_OOM;
 			result = NULL;
 			goto out;
 		}
@@ -154,10 +156,10 @@ out:
 }
 
 struct vm_operations_struct gfs2_vm_ops_private = {
-	.nopage = gfs2_private_nopage,
+	.fault = gfs2_private_fault,
 };
 
 struct vm_operations_struct gfs2_vm_ops_sharewrite = {
-	.nopage = gfs2_sharewrite_nopage,
+	.fault = gfs2_sharewrite_fault,
 };
 
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 5416673418b8..af48b792ca04 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -25,8 +25,8 @@
 /*
  * Fill in the supplied page for mmap
  */
-static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
-				     unsigned long address, int *type)
+static struct page* ncp_file_mmap_fault(struct vm_area_struct *area,
+						struct fault_data *fdata)
 {
 	struct file *file = area->vm_file;
 	struct dentry *dentry = file->f_path.dentry;
@@ -40,15 +40,17 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
 
 	page = alloc_page(GFP_HIGHUSER); /* ncpfs has nothing against high pages
 	           as long as recvmsg and memset works on it */
-	if (!page)
-		return page;
+	if (!page) {
+		fdata->type = VM_FAULT_OOM;
+		return NULL;
+	}
 	pg_addr = kmap(page);
-	address &= PAGE_MASK;
-	pos = address - area->vm_start + (area->vm_pgoff << PAGE_SHIFT);
+	pos = fdata->pgoff << PAGE_SHIFT;
 
 	count = PAGE_SIZE;
-	if (address + PAGE_SIZE > area->vm_end) {
-		count = area->vm_end - address;
+	if (fdata->address + PAGE_SIZE > area->vm_end) {
+		WARN_ON(1); /* shouldn't happen? */
+		count = area->vm_end - fdata->address;
 	}
 	/* what we can read in one go */
 	bufsize = NCP_SERVER(inode)->buffer_size;
@@ -91,15 +93,14 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
 	 * fetches from the network, here the analogue of disk.
 	 * -- wli
 	 */
-	if (type)
-		*type = VM_FAULT_MAJOR;
+	fdata->type = VM_FAULT_MAJOR;
 	count_vm_event(PGMAJFAULT);
 	return page;
 }
 
 static struct vm_operations_struct ncp_file_mmap =
 {
-	.nopage	= ncp_file_mmap_nopage,
+	.fault = ncp_file_mmap_fault,
 };
 
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 84bf6e79de23..460d440310f2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -232,7 +232,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	 * might now be discovering a truncate that hit on another node.
 	 * block_read_full_page->get_block freaks out if it is asked to read
 	 * beyond the end of a file, so we check here.  Callers
-	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
 	 * and notice that the page they just read isn't needed.
 	 *
 	 * XXX sys_readahead() seems to get that wrong?
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 904f39ff5340..cd75508b1c8a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -60,24 +60,23 @@ static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
 	return sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
-static struct page *ocfs2_nopage(struct vm_area_struct * area,
-				 unsigned long address,
-				 int *type)
+static struct page *ocfs2_fault(struct vm_area_struct *area,
+						struct fault_data *fdata)
 {
-	struct page *page = NOPAGE_SIGBUS;
+	struct page *page = NULL;
 	sigset_t blocked, oldset;
 	int ret;
 
-	mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
-		   type);
+	mlog_entry("(area=%p, page offset=%lu)\n", area, fdata->pgoff);
 
 	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
 	if (ret < 0) {
+		fdata->type = VM_FAULT_SIGBUS;
 		mlog_errno(ret);
 		goto out;
 	}
 
-	page = filemap_nopage(area, address, type);
+	page = filemap_fault(area, fdata);
 
 	ret = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret < 0)
@@ -209,7 +208,7 @@ out:
 }
 
 static struct vm_operations_struct ocfs2_file_vm_ops = {
-	.nopage		= ocfs2_nopage,
+	.fault		= ocfs2_fault,
 	.page_mkwrite	= ocfs2_page_mkwrite,
 };
 
@@ -226,7 +225,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
 out:
 	vma->vm_ops = &ocfs2_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 92b2f225712f..f12e80a69c68 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -213,18 +213,19 @@ xfs_file_fsync(
 
 #ifdef CONFIG_XFS_DMAPI
 STATIC struct page *
-xfs_vm_nopage(
-	struct vm_area_struct	*area,
-	unsigned long		address,
-	int			*type)
+xfs_vm_fault(
+	struct vm_area_struct	*vma,
+	struct fault_data	*fdata)
 {
-	struct inode	*inode = area->vm_file->f_path.dentry->d_inode;
+	struct inode	*inode = vma->vm_file->f_path.dentry->d_inode;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0))
+	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0)) {
+		fdata->type = VM_FAULT_SIGBUS;
 		return NULL;
-	return filemap_nopage(area, address, type);
+	}
+	return filemap_fault(vma, fdata);
 }
 #endif /* CONFIG_XFS_DMAPI */
 
@@ -310,7 +311,7 @@ xfs_file_mmap(
 	struct vm_area_struct *vma)
 {
 	vma->vm_ops = &xfs_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 
 #ifdef CONFIG_XFS_DMAPI
 	if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
@@ -465,14 +466,12 @@ const struct file_operations xfs_dir_file_operations = {
 };
 
 static struct vm_operations_struct xfs_file_vm_ops = {
-	.nopage		= filemap_nopage,
-	.populate	= filemap_populate,
+	.fault		= filemap_fault,
 };
 
 #ifdef CONFIG_XFS_DMAPI
 static struct vm_operations_struct xfs_dmapi_file_vm_ops = {
-	.nopage		= xfs_vm_nopage,
-	.populate	= filemap_populate,
+	.fault		= xfs_vm_fault,
 #ifdef HAVE_VMOP_MPROTECT
 	.mprotect	= xfs_vm_mprotect,
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ca9536a348c8..f28a1b3e63a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -173,6 +173,7 @@ extern unsigned int kobjsize(const void *objp);
 					 * In this case, do_no_page must
 					 * return with the page locked.
 					 */
+#define VM_CAN_NONLINEAR 0x10000000	/* Has ->fault & does nonlinear pages */
 
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -196,6 +197,25 @@ extern unsigned int kobjsize(const void *objp);
  */
 extern pgprot_t protection_map[16];
 
+#define FAULT_FLAG_WRITE	0x01
+#define FAULT_FLAG_NONLINEAR	0x02
+
+/*
+ * fault_data is filled in the the pagefault handler and passed to the
+ * vma's ->fault function. That function is responsible for filling in
+ * 'type', which is the type of fault if a page is returned, or the type
+ * of error if NULL is returned.
+ *
+ * pgoff should be used in favour of address, if possible. If pgoff is
+ * used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get
+ * nonlinear mapping support.
+ */
+struct fault_data {
+	unsigned long address;
+	pgoff_t pgoff;
+	unsigned int flags;
+	int type;
+};
 
 /*
  * These are the virtual MM functions - opening of an area, closing and
@@ -205,9 +225,15 @@ extern pgprot_t protection_map[16];
 struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
-	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
-	unsigned long (*nopfn)(struct vm_area_struct * area, unsigned long address);
-	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+	struct page *(*fault)(struct vm_area_struct *vma,
+			struct fault_data *fdata);
+	struct page *(*nopage)(struct vm_area_struct *area,
+			unsigned long address, int *type);
+	unsigned long (*nopfn)(struct vm_area_struct *area,
+			unsigned long address);
+	int (*populate)(struct vm_area_struct *area, unsigned long address,
+			unsigned long len, pgprot_t prot, unsigned long pgoff,
+			int nonblock);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
@@ -661,7 +687,6 @@ static inline int page_mapped(struct page *page)
  */
 #define NOPAGE_SIGBUS	(NULL)
 #define NOPAGE_OOM	((struct page *) (-1))
-#define NOPAGE_REFAULT	((struct page *) (-2))	/* Return to userspace, rerun */
 
 /*
  * Error return values for the *_nopfn functions
@@ -1110,9 +1135,11 @@ extern void truncate_inode_pages_range(struct address_space *,
 				       loff_t lstart, loff_t lend);
 
 /* generic vm_area_ops exported for stackable file systems */
-extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-extern int filemap_populate(struct vm_area_struct *, unsigned long,
-		unsigned long, pgprot_t, unsigned long, int);
+extern struct page *filemap_fault(struct vm_area_struct *, struct fault_data *);
+extern struct page * __deprecated_for_modules
+filemap_nopage(struct vm_area_struct *, unsigned long, int *);
+extern int __deprecated_for_modules filemap_populate(struct vm_area_struct *,
+		unsigned long, unsigned long, pgprot_t, unsigned long, int);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
diff --git a/ipc/shm.c b/ipc/shm.c
index 242c3f66493a..e2d090348b1e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -224,13 +224,13 @@ static void shm_close(struct vm_area_struct *vma)
 	mutex_unlock(&shm_ids(ns).mutex);
 }
 
-static struct page *shm_nopage(struct vm_area_struct *vma,
-			       unsigned long address, int *type)
+static struct page *shm_fault(struct vm_area_struct *vma,
+					struct fault_data *fdata)
 {
 	struct file *file = vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 
-	return sfd->vm_ops->nopage(vma, address, type);
+	return sfd->vm_ops->fault(vma, fdata);
 }
 
 #ifdef CONFIG_NUMA
@@ -269,6 +269,7 @@ static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 	if (ret != 0)
 		return ret;
 	sfd->vm_ops = vma->vm_ops;
+	BUG_ON(!sfd->vm_ops->fault);
 	vma->vm_ops = &shm_vm_ops;
 	shm_open(vma);
 
@@ -327,7 +328,7 @@ static const struct file_operations shm_file_operations = {
 static struct vm_operations_struct shm_vm_ops = {
 	.open	= shm_open,	/* callback for a new vm-area open */
 	.close	= shm_close,	/* callback for when the vm-area is released */
-	.nopage	= shm_nopage,
+	.fault	= shm_fault,
 #if defined(CONFIG_NUMA)
 	.set_policy = shm_set_policy,
 	.get_policy = shm_get_policy,
diff --git a/mm/filemap.c b/mm/filemap.c
index 462cda58a18e..26b992d169e5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1301,40 +1301,38 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
 #define MMAP_LOTSAMISS  (100)
 
 /**
- * filemap_nopage - read in file data for page fault handling
- * @area:	the applicable vm_area
- * @address:	target address to read in
- * @type:	returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
+ * filemap_fault - read in file data for page fault handling
+ * @vma:	user vma (not used)
+ * @fdata:	the applicable fault_data
  *
- * filemap_nopage() is invoked via the vma operations vector for a
+ * filemap_fault() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
  * The goto's are kind of ugly, but this streamlines the normal case of having
  * it in the page cache, and handles the special cases reasonably without
  * having a lot of duplicated code.
  */
-struct page *filemap_nopage(struct vm_area_struct *area,
-				unsigned long address, int *type)
+struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
 {
 	int error;
-	struct file *file = area->vm_file;
+	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct file_ra_state *ra = &file->f_ra;
 	struct inode *inode = mapping->host;
 	struct page *page;
-	unsigned long size, pgoff;
-	int did_readaround = 0, majmin = VM_FAULT_MINOR;
+	unsigned long size;
+	int did_readaround = 0;
 
-	BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
+	fdata->type = VM_FAULT_MINOR;
 
-	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff >= size)
+	if (fdata->pgoff >= size)
 		goto outside_data_content;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(area))
+	if (VM_RandomReadHint(vma))
 		goto no_cached_page;
 
 	/*
@@ -1343,19 +1341,19 @@ struct page *filemap_nopage(struct vm_area_struct *area,
 	 *
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
-	if (VM_SequentialReadHint(area))
-		page_cache_readahead(mapping, ra, file, pgoff, 1);
+	if (VM_SequentialReadHint(vma))
+		page_cache_readahead(mapping, ra, file, fdata->pgoff, 1);
 
 	/*
 	 * Do we have something in the page cache already?
 	 */
 retry_find:
-	page = find_lock_page(mapping, pgoff);
+	page = find_lock_page(mapping, fdata->pgoff);
 	if (!page) {
 		unsigned long ra_pages;
 
-		if (VM_SequentialReadHint(area)) {
-			handle_ra_miss(mapping, ra, pgoff);
+		if (VM_SequentialReadHint(vma)) {
+			handle_ra_miss(mapping, ra, fdata->pgoff);
 			goto no_cached_page;
 		}
 		ra->mmap_miss++;
@@ -1372,7 +1370,7 @@ retry_find:
 		 * check did_readaround, as this is an inner loop.
 		 */
 		if (!did_readaround) {
-			majmin = VM_FAULT_MAJOR;
+			fdata->type = VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 		}
 		did_readaround = 1;
@@ -1380,11 +1378,11 @@ retry_find:
 		if (ra_pages) {
 			pgoff_t start = 0;
 
-			if (pgoff > ra_pages / 2)
-				start = pgoff - ra_pages / 2;
+			if (fdata->pgoff > ra_pages / 2)
+				start = fdata->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, pgoff);
+		page = find_lock_page(mapping, fdata->pgoff);
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1401,7 +1399,7 @@ retry_find:
 
 	/* Must recheck i_size under page lock */
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (unlikely(pgoff >= size)) {
+	if (unlikely(fdata->pgoff >= size)) {
 		unlock_page(page);
 		goto outside_data_content;
 	}
@@ -1410,8 +1408,6 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 */
 	mark_page_accessed(page);
-	if (type)
-		*type = majmin;
 	return page;
 
 outside_data_content:
@@ -1419,15 +1415,17 @@ outside_data_content:
 	 * An external ptracer can access pages that normally aren't
 	 * accessible..
 	 */
-	if (area->vm_mm == current->mm)
-		return NOPAGE_SIGBUS;
+	if (vma->vm_mm == current->mm) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
 	/* Fall through to the non-read-ahead case */
 no_cached_page:
 	/*
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, pgoff);
+	error = page_cache_read(file, fdata->pgoff);
 
 	/*
 	 * The page we want has now been added to the page cache.
@@ -1443,13 +1441,15 @@ no_cached_page:
 	 * to schedule I/O.
 	 */
 	if (error == -ENOMEM)
-		return NOPAGE_OOM;
-	return NOPAGE_SIGBUS;
+		fdata->type = VM_FAULT_OOM;
+	else
+		fdata->type = VM_FAULT_SIGBUS;
+	return NULL;
 
 page_not_uptodate:
 	/* IO error path */
 	if (!did_readaround) {
-		majmin = VM_FAULT_MAJOR;
+		fdata->type = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
 	}
 
@@ -1468,7 +1468,30 @@ page_not_uptodate:
 
 	/* Things didn't work out. Return zero to tell the mm layer so. */
 	shrink_readahead_size_eio(file, ra);
-	return NOPAGE_SIGBUS;
+	fdata->type = VM_FAULT_SIGBUS;
+	return NULL;
+}
+EXPORT_SYMBOL(filemap_fault);
+
+/*
+ * filemap_nopage and filemap_populate are legacy exports that are not used
+ * in tree. Scheduled for removal.
+ */
+struct page *filemap_nopage(struct vm_area_struct *area,
+				unsigned long address, int *type)
+{
+	struct page *page;
+	struct fault_data fdata;
+	fdata.address = address;
+	fdata.pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
+			+ area->vm_pgoff;
+	fdata.flags = 0;
+
+	page = filemap_fault(area, &fdata);
+	if (type)
+		*type = fdata.type;
+
+	return page;
 }
 EXPORT_SYMBOL(filemap_nopage);
 
@@ -1646,8 +1669,7 @@ repeat:
 EXPORT_SYMBOL(filemap_populate);
 
 struct vm_operations_struct generic_file_vm_ops = {
-	.nopage		= filemap_nopage,
-	.populate	= filemap_populate,
+	.fault		= filemap_fault,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1660,7 +1682,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 65ffc321f0c0..82f4b8e9834e 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -205,62 +205,67 @@ __xip_unmap (struct address_space * mapping,
 }
 
 /*
- * xip_nopage() is invoked via the vma operations vector for a
+ * xip_fault() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
- * This function is derived from filemap_nopage, but used for execute in place
+ * This function is derived from filemap_fault, but used for execute in place
  */
-static struct page *
-xip_file_nopage(struct vm_area_struct * area,
-		   unsigned long address,
-		   int *type)
+static struct page *xip_file_fault(struct vm_area_struct *area,
+					struct fault_data *fdata)
 {
 	struct file *file = area->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	struct page *page;
-	unsigned long size, pgoff, endoff;
+	pgoff_t size;
 
-	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
-		+ area->vm_pgoff;
-	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
-		+ area->vm_pgoff;
+	/* XXX: are VM_FAULT_ codes OK? */
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff >= size)
-		return NOPAGE_SIGBUS;
+	if (fdata->pgoff >= size) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
 
-	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
+	page = mapping->a_ops->get_xip_page(mapping,
+					fdata->pgoff*(PAGE_SIZE/512), 0);
 	if (!IS_ERR(page))
 		goto out;
-	if (PTR_ERR(page) != -ENODATA)
-		return NOPAGE_SIGBUS;
+	if (PTR_ERR(page) != -ENODATA) {
+		fdata->type = VM_FAULT_OOM;
+		return NULL;
+	}
 
 	/* sparse block */
 	if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
 	    (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
 	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
 		/* maybe shared writable, allocate new block */
-		page = mapping->a_ops->get_xip_page (mapping,
-			pgoff*(PAGE_SIZE/512), 1);
-		if (IS_ERR(page))
-			return NOPAGE_SIGBUS;
+		page = mapping->a_ops->get_xip_page(mapping,
+					fdata->pgoff*(PAGE_SIZE/512), 1);
+		if (IS_ERR(page)) {
+			fdata->type = VM_FAULT_SIGBUS;
+			return NULL;
+		}
 		/* unmap page at pgoff from all other vmas */
-		__xip_unmap(mapping, pgoff);
+		__xip_unmap(mapping, fdata->pgoff);
 	} else {
 		/* not shared and writable, use xip_sparse_page() */
 		page = xip_sparse_page();
-		if (!page)
-			return NOPAGE_OOM;
+		if (!page) {
+			fdata->type = VM_FAULT_OOM;
+			return NULL;
+		}
 	}
 
 out:
+	fdata->type = VM_FAULT_MINOR;
 	page_cache_get(page);
 	return page;
 }
 
 static struct vm_operations_struct xip_file_vm_ops = {
-	.nopage         = xip_file_nopage,
+	.fault	= xip_file_fault,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -269,6 +274,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 
 	file_accessed(file);
 	vma->vm_ops = &xip_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 4e3f53dd5fd4..01e51f01b84e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -126,6 +126,25 @@ out:
 	return err;
 }
 
+static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long addr, unsigned long size, pgoff_t pgoff)
+{
+	int err;
+
+	do {
+		err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
+		if (err)
+			return err;
+
+		size -= PAGE_SIZE;
+		addr += PAGE_SIZE;
+		pgoff++;
+	} while (size);
+
+        return 0;
+
+}
+
 /***
  * sys_remap_file_pages - remap arbitrary pages of a shared backing store
  *                        file within an existing vma.
@@ -183,41 +202,63 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 	 * the single existing vma.  vm_private_data is used as a
 	 * swapout cursor in a VM_NONLINEAR vma.
 	 */
-	if (vma && (vma->vm_flags & VM_SHARED) &&
-		(!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
-		vma->vm_ops && vma->vm_ops->populate &&
-			end > start && start >= vma->vm_start &&
-				end <= vma->vm_end) {
-
-		/* Must set VM_NONLINEAR before any pages are populated. */
-		if (pgoff != linear_page_index(vma, start) &&
-		    !(vma->vm_flags & VM_NONLINEAR)) {
-			if (!has_write_lock) {
-				up_read(&mm->mmap_sem);
-				down_write(&mm->mmap_sem);
-				has_write_lock = 1;
-				goto retry;
+	if (!vma || !(vma->vm_flags & VM_SHARED))
+		goto out;
+
+	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
+		goto out;
+
+	if ((!vma->vm_ops || !vma->vm_ops->populate) &&
+					!(vma->vm_flags & VM_CAN_NONLINEAR))
+		goto out;
+
+	if (end <= start || start < vma->vm_start || end > vma->vm_end)
+		goto out;
+
+	/* Must set VM_NONLINEAR before any pages are populated. */
+	if (!(vma->vm_flags & VM_NONLINEAR)) {
+		/* Don't need a nonlinear mapping, exit success */
+		if (pgoff == linear_page_index(vma, start)) {
+			err = 0;
+			goto out;
+		}
+
+		if (!has_write_lock) {
+			up_read(&mm->mmap_sem);
+			down_write(&mm->mmap_sem);
+			has_write_lock = 1;
+			goto retry;
+		}
+		mapping = vma->vm_file->f_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		flush_dcache_mmap_lock(mapping);
+		vma->vm_flags |= VM_NONLINEAR;
+		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+		flush_dcache_mmap_unlock(mapping);
+		spin_unlock(&mapping->i_mmap_lock);
+	}
+
+	if (vma->vm_flags & VM_CAN_NONLINEAR) {
+		err = populate_range(mm, vma, start, size, pgoff);
+		if (!err && !(flags & MAP_NONBLOCK)) {
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
 			}
-			mapping = vma->vm_file->f_mapping;
-			spin_lock(&mapping->i_mmap_lock);
-			flush_dcache_mmap_lock(mapping);
-			vma->vm_flags |= VM_NONLINEAR;
-			vma_prio_tree_remove(vma, &mapping->i_mmap);
-			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-			flush_dcache_mmap_unlock(mapping);
-			spin_unlock(&mapping->i_mmap_lock);
+			make_pages_present(start, start+size);
 		}
+	} else
+		err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
+					    	pgoff, flags & MAP_NONBLOCK);
 
-		err = vma->vm_ops->populate(vma, start, size,
-					    vma->vm_page_prot,
-					    pgoff, flags & MAP_NONBLOCK);
+	/*
+	 * We can't clear VM_NONLINEAR because we'd have to do
+	 * it after ->populate completes, and that would prevent
+	 * downgrading the lock.  (Locks can't be upgraded).
+	 */
 
-		/*
-		 * We can't clear VM_NONLINEAR because we'd have to do
-		 * it after ->populate completes, and that would prevent
-		 * downgrading the lock.  (Locks can't be upgraded).
-		 */
-	}
+out:
 	if (likely(!has_write_lock))
 		up_read(&mm->mmap_sem);
 	else
diff --git a/mm/memory.c b/mm/memory.c
index e6c99f6b5649..eee7fec3ab54 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1047,7 +1047,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || !vma->vm_ops->nopage))
+		    (!vma->vm_ops || (!vma->vm_ops->nopage &&
+					!vma->vm_ops->fault)))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -2288,10 +2289,10 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
@@ -2300,64 +2301,82 @@ oom:
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page, *nopage_page;
+	struct page *page, *faulted_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 	struct page *dirty_page = NULL;
+	struct fault_data fdata;
+
+	fdata.address = address & PAGE_MASK;
+	fdata.pgoff = pgoff;
+	fdata.flags = flags;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-	/* no page was available -- either SIGBUS, OOM or REFAULT */
-	if (unlikely(nopage_page == NOPAGE_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(nopage_page == NOPAGE_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(nopage_page == NOPAGE_REFAULT))
-		return VM_FAULT_MINOR;
+	if (likely(vma->vm_ops->fault)) {
+		fdata.type = -1;
+		faulted_page = vma->vm_ops->fault(vma, &fdata);
+		WARN_ON(fdata.type == -1);
+		if (unlikely(!faulted_page))
+			return fdata.type;
+	} else {
+		/* Legacy ->nopage path */
+		fdata.type = VM_FAULT_MINOR;
+		faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
+								&fdata.type);
+		/* no page was available -- either SIGBUS or OOM */
+		if (unlikely(faulted_page == NOPAGE_SIGBUS))
+			return VM_FAULT_SIGBUS;
+		else if (unlikely(faulted_page == NOPAGE_OOM))
+			return VM_FAULT_OOM;
+	}
 
-	BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page));
 	/*
-	 * For consistency in subsequent calls, make the nopage_page always
+	 * For consistency in subsequent calls, make the faulted_page always
 	 * locked.
 	 */
 	if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
-		lock_page(nopage_page);
+		lock_page(faulted_page);
+	else
+		BUG_ON(!PageLocked(faulted_page));
 
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	page = nopage_page;
-	if (write_access) {
+	page = faulted_page;
+	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
+			anon = 1;
 			if (unlikely(anon_vma_prepare(vma))) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 			if (!page) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
-			copy_user_highpage(page, nopage_page, address, vma);
-			anon = 1;
+			copy_user_highpage(page, faulted_page, address, vma);
 		} else {
-			/* if the page will be shareable, see if the backing
+			/*
+			 * If the page will be shareable, see if the backing
 			 * address space wants to know that the page is about
-			 * to become writable */
+			 * to become writable
+			 */
 			if (vma->vm_ops->page_mkwrite &&
 			    vma->vm_ops->page_mkwrite(vma, page) < 0) {
-				ret = VM_FAULT_SIGBUS;
-				goto out_error;
+				fdata.type = VM_FAULT_SIGBUS;
+				anon = 1; /* no anon but release faulted_page */
+				goto out;
 			}
 		}
+
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2373,10 +2392,10 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_none(*page_table))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
-		if (write_access)
+		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
@@ -2386,7 +2405,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(page);
-			if (write_access) {
+			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
 				get_page(dirty_page);
 			}
@@ -2399,25 +2418,42 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (anon)
 			page_cache_release(page);
 		else
-			anon = 1; /* not anon, but release nopage_page */
+			anon = 1; /* no anon but release faulted_page */
 	}
 
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(nopage_page);
+	unlock_page(faulted_page);
 	if (anon)
-		page_cache_release(nopage_page);
+		page_cache_release(faulted_page);
 	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 
-	return ret;
+	return fdata.type;
+}
 
-out_error:
-	anon = 1; /* relase nopage_page */
-	goto out;
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	pgoff_t pgoff = (((address & PAGE_MASK)
+			- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
+}
+
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pgoff_t pgoff, pte_t orig_pte)
+{
+	unsigned int flags = FAULT_FLAG_NONLINEAR |
+				(write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -2496,9 +2532,14 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
 	pgoff = pte_to_pgoff(orig_pte);
+
+	if (vma->vm_ops && vma->vm_ops->fault)
+		return do_nonlinear_fault(mm, vma, address, page_table, pmd,
+					write_access, pgoff, orig_pte);
+
+	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
 					vma->vm_page_prot, pgoff, 0);
 	if (err == -ENOMEM)
@@ -2532,10 +2573,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
-				if (vma->vm_ops->nopage)
-					return do_no_page(mm, vma, address,
-							  pte, pmd,
-							  write_access);
+				if (vma->vm_ops->fault || vma->vm_ops->nopage)
+					return do_linear_fault(mm, vma, address,
+						pte, pmd, write_access, entry);
 				if (unlikely(vma->vm_ops->nopfn))
 					return do_no_pfn(mm, vma, address, pte,
 							 pmd, write_access);
diff --git a/mm/mmap.c b/mm/mmap.c
index 144b4a290f2c..724f342bcf89 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1165,12 +1165,8 @@ out:
 		mm->locked_vm += len >> PAGE_SHIFT;
 		make_pages_present(addr, addr + len);
 	}
-	if (flags & MAP_POPULATE) {
-		up_write(&mm->mmap_sem);
-		sys_remap_file_pages(addr, len, 0,
-					pgoff, flags & MAP_NONBLOCK);
-		down_write(&mm->mmap_sem);
-	}
+	if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+		make_pages_present(addr, addr + len);
 	return addr;
 
 unmap_and_free_vma:
diff --git a/mm/nommu.c b/mm/nommu.c
index 8bbbf147a794..aee0e1b0ebe7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1341,8 +1341,7 @@ int in_gate_area_no_task(unsigned long addr)
 	return 0;
 }
 
-struct page *filemap_nopage(struct vm_area_struct *area,
-			unsigned long address, int *type)
+struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
 {
 	BUG();
 	return NULL;
diff --git a/mm/rmap.c b/mm/rmap.c
index 61e492597a0b..fede5c7910be 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -621,8 +621,10 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
 			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
-			if (vma->vm_ops)
+			if (vma->vm_ops) {
 				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+				print_symbol (KERN_EMERG "  vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
+			}
 			if (vma->vm_file && vma->vm_file->f_op)
 				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
 			BUG();
diff --git a/mm/shmem.c b/mm/shmem.c
index 5808fadd3944..6b44440f1b24 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -83,7 +83,7 @@ enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_WRITE,	/* may exceed i_size, may allocate page */
-	SGP_NOPAGE,	/* same as SGP_CACHE, return with page locked */
+	SGP_FAULT,	/* same as SGP_CACHE, return with page locked */
 };
 
 static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -1101,6 +1101,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 
 	if (idx >= SHMEM_MAX_INDEX)
 		return -EFBIG;
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
 	/*
 	 * Normally, filepage is NULL on entry, and either found
 	 * uptodate immediately, or allocated and zeroed, or read
@@ -1291,7 +1295,7 @@ repeat:
 done:
 	if (*pagep != filepage) {
 		*pagep = filepage;
-		if (sgp != SGP_NOPAGE)
+		if (sgp != SGP_FAULT)
 			unlock_page(filepage);
 
 	}
@@ -1305,76 +1309,31 @@ failed:
 	return error;
 }
 
-static struct page *shmem_nopage(struct vm_area_struct *vma,
-				 unsigned long address, int *type)
+static struct page *shmem_fault(struct vm_area_struct *vma,
+					struct fault_data *fdata)
 {
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct page *page = NULL;
-	unsigned long idx;
 	int error;
 
 	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
 
-	idx = (address - vma->vm_start) >> PAGE_SHIFT;
-	idx += vma->vm_pgoff;
-	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
-	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-		return NOPAGE_SIGBUS;
+	if (((loff_t)fdata->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
 
-	error = shmem_getpage(inode, idx, &page, SGP_NOPAGE, type);
-	if (error)
-		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
+	error = shmem_getpage(inode, fdata->pgoff, &page,
+						SGP_FAULT, &fdata->type);
+	if (error) {
+		fdata->type = ((error == -ENOMEM)?VM_FAULT_OOM:VM_FAULT_SIGBUS);
+		return NULL;
+	}
 
 	mark_page_accessed(page);
 	return page;
 }
 
-static int shmem_populate(struct vm_area_struct *vma,
-	unsigned long addr, unsigned long len,
-	pgprot_t prot, unsigned long pgoff, int nonblock)
-{
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
-	struct mm_struct *mm = vma->vm_mm;
-	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
-	unsigned long size;
-
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
-		return -EINVAL;
-
-	while ((long) len > 0) {
-		struct page *page = NULL;
-		int err;
-		/*
-		 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
-		 */
-		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
-		if (err)
-			return err;
-		/* Page may still be null, but only if nonblock was set. */
-		if (page) {
-			mark_page_accessed(page);
-			err = install_page(mm, vma, addr, page, prot);
-			if (err) {
-				page_cache_release(page);
-				return err;
-			}
-		} else if (vma->vm_flags & VM_NONLINEAR) {
-			/* No page was found just because we can't read it in
-			 * now (being here implies nonblock != 0), but the page
-			 * may exist, so set the PTE to fault it in later. */
-    			err = install_file_pte(mm, vma, addr, pgoff, prot);
-			if (err)
-	    			return err;
-		}
-
-		len -= PAGE_SIZE;
-		addr += PAGE_SIZE;
-		pgoff++;
-	}
-	return 0;
-}
-
 #ifdef CONFIG_NUMA
 int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 {
@@ -1419,7 +1378,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 	return 0;
 }
 
@@ -2465,8 +2424,7 @@ static const struct super_operations shmem_ops = {
 };
 
 static struct vm_operations_struct shmem_vm_ops = {
-	.nopage		= shmem_nopage,
-	.populate	= shmem_populate,
+	.fault		= shmem_fault,
 #ifdef CONFIG_NUMA
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
diff --git a/mm/truncate.c b/mm/truncate.c
index aed85f0b707f..5cdfbc1a59fd 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,7 +82,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
 /*
  * If truncate cannot remove the fs-private metadata from the page, the page
  * becomes anonymous.  It will be left on the LRU and may even be mapped into
- * user pagetables if we're racing with filemap_nopage().
+ * user pagetables if we're racing with filemap_fault().
  *
  * We need to bale out if page->mapping is no longer equal to the original
  * mapping.  This happens a) when the VM reclaimed the page while we waited on
-- 
cgit v1.3-8-gc7d7


From d0217ac04ca6591841e5665f518e38064f4e65bd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 19 Jul 2007 01:47:03 -0700
Subject: mm: fault feedback #1

Change ->fault prototype.  We now return an int, which contains
VM_FAULT_xxx code in the low byte, and FAULT_RET_xxx code in the next byte.
 FAULT_RET_ code tells the VM whether a page was found, whether it has been
locked, and potentially other things.  This is not quite the way he wanted
it yet, but that's changed in the next patch (which requires changes to
arch code).

This means we no longer set VM_CAN_INVALIDATE in the vma in order to say
that a page is locked which requires filemap_nopage to go away (because we
can no longer remain backward compatible without that flag), but we were
going to do that anyway.

struct fault_data is renamed to struct vm_fault as Linus asked. address
is now a void __user * that we should firmly encourage drivers not to use
without really good reason.

The page is now returned via a page pointer in the vm_fault struct.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/feature-removal-schedule.txt |  20 +--
 Documentation/filesystems/Locking          |   2 +-
 fs/gfs2/ops_file.c                         |   2 -
 fs/gfs2/ops_vm.c                           |  47 +++---
 fs/ncpfs/mmap.c                            |  38 ++---
 fs/ocfs2/mmap.c                            |  30 ++--
 fs/xfs/linux-2.6/xfs_file.c                |  14 +-
 include/linux/mm.h                         |  84 ++++++----
 ipc/shm.c                                  |   5 +-
 mm/filemap.c                               | 249 +++--------------------------
 mm/filemap_xip.c                           |  37 ++---
 mm/fremap.c                                |  85 ++--------
 mm/hugetlb.c                               |   7 +-
 mm/memory.c                                | 109 ++++++-------
 mm/nommu.c                                 |   4 +-
 mm/shmem.c                                 |  29 ++--
 16 files changed, 238 insertions(+), 524 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 716568afdff8..cff63befeb9a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -135,26 +135,8 @@ Who:	Greg Kroah-Hartman <gregkh@suse.de>
 
 ---------------------------
 
-What:	filemap_nopage, filemap_populate
-When:	April 2007
-Why:	These legacy interfaces no longer have any callers in the kernel and
-	any functionality provided can be provided with filemap_fault. The
-	removal schedule is short because they are a big maintainence burden
-	and have some bugs.
-Who:	Nick Piggin <npiggin@suse.de>
-
----------------------------
-
-What:	vm_ops.populate, install_page
-When:	April 2007
-Why:	These legacy interfaces no longer have any callers in the kernel and
-	any functionality provided can be provided with vm_ops.fault.
-Who:	Nick Piggin <npiggin@suse.de>
-
----------------------------
-
 What:	vm_ops.nopage
-When:	February 2008, provided in-kernel callers have been converted
+When:	Soon, provided in-kernel callers have been converted
 Why:	This interface is replaced by vm_ops.fault, but it has been around
 	forever, is used by a lot of drivers, and doesn't cost much to
 	maintain.
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 91ec4b40ebfe..f0f825808ca4 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -510,7 +510,7 @@ More details about quota locking can be found in fs/dquot.c.
 prototypes:
 	void (*open)(struct vm_area_struct*);
 	void (*close)(struct vm_area_struct*);
-	struct page *(*fault)(struct vm_area_struct*, struct fault_data *);
+	int (*fault)(struct vm_area_struct*, struct vm_fault *);
 	struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *);
 	int (*page_mkwrite)(struct vm_area_struct *, struct page *);
 
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 581ac11b2656..1a5e8e893d75 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -364,8 +364,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	else
 		vma->vm_ops = &gfs2_vm_ops_private;
 
-	vma->vm_flags |= VM_CAN_INVALIDATE|VM_CAN_NONLINEAR;
-
 	gfs2_glock_dq_uninit(&i_gh);
 
 	return error;
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index e9fe6eb74e75..dc287d2e3a66 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -27,13 +27,12 @@
 #include "trans.h"
 #include "util.h"
 
-static struct page *gfs2_private_fault(struct vm_area_struct *vma,
-					struct fault_data *fdata)
+static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
 
 	set_bit(GIF_PAGED, &ip->i_flags);
-	return filemap_fault(vma, fdata);
+	return filemap_fault(vma, vmf);
 }
 
 static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
@@ -104,55 +103,55 @@ out:
 	return error;
 }
 
-static struct page *gfs2_sharewrite_fault(struct vm_area_struct *vma,
-						struct fault_data *fdata)
+static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
+						struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
 	struct gfs2_file *gf = file->private_data;
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	struct gfs2_holder i_gh;
-	struct page *result = NULL;
 	int alloc_required;
 	int error;
+	int ret = VM_FAULT_MINOR;
 
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
 	if (error)
-		return NULL;
+		goto out;
 
 	set_bit(GIF_PAGED, &ip->i_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
 	error = gfs2_write_alloc_required(ip,
-					(u64)fdata->pgoff << PAGE_CACHE_SHIFT,
+					(u64)vmf->pgoff << PAGE_CACHE_SHIFT,
 					PAGE_CACHE_SIZE, &alloc_required);
 	if (error) {
-		fdata->type = VM_FAULT_OOM; /* XXX: are these right? */
-		goto out;
+		ret = VM_FAULT_OOM; /* XXX: are these right? */
+		goto out_unlock;
 	}
 
 	set_bit(GFF_EXLOCK, &gf->f_flags);
-	result = filemap_fault(vma, fdata);
+	ret = filemap_fault(vma, vmf);
 	clear_bit(GFF_EXLOCK, &gf->f_flags);
-	if (!result)
-		goto out;
+	if (ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE))
+		goto out_unlock;
 
 	if (alloc_required) {
-		error = alloc_page_backing(ip, result);
+		/* XXX: do we need to drop page lock around alloc_page_backing?*/
+		error = alloc_page_backing(ip, vmf->page);
 		if (error) {
-			if (vma->vm_flags & VM_CAN_INVALIDATE)
-				unlock_page(result);
-			page_cache_release(result);
-			fdata->type = VM_FAULT_OOM;
-			result = NULL;
-			goto out;
+			if (ret & FAULT_RET_LOCKED)
+				unlock_page(vmf->page);
+			page_cache_release(vmf->page);
+			ret = VM_FAULT_OOM;
+			goto out_unlock;
 		}
-		set_page_dirty(result);
+		set_page_dirty(vmf->page);
 	}
 
-out:
+out_unlock:
 	gfs2_glock_dq_uninit(&i_gh);
-
-	return result;
+out:
+	return ret;
 }
 
 struct vm_operations_struct gfs2_vm_ops_private = {
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index af48b792ca04..a94473d3072c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -24,33 +24,35 @@
 
 /*
  * Fill in the supplied page for mmap
+ * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
+ * page?
  */
-static struct page* ncp_file_mmap_fault(struct vm_area_struct *area,
-						struct fault_data *fdata)
+static int ncp_file_mmap_fault(struct vm_area_struct *area,
+					struct vm_fault *vmf)
 {
 	struct file *file = area->vm_file;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	struct page* page;
 	char *pg_addr;
 	unsigned int already_read;
 	unsigned int count;
 	int bufsize;
-	int pos;
+	int pos; /* XXX: loff_t ? */
 
-	page = alloc_page(GFP_HIGHUSER); /* ncpfs has nothing against high pages
-	           as long as recvmsg and memset works on it */
-	if (!page) {
-		fdata->type = VM_FAULT_OOM;
-		return NULL;
-	}
-	pg_addr = kmap(page);
-	pos = fdata->pgoff << PAGE_SHIFT;
+	/*
+	 * ncpfs has nothing against high pages as long
+	 * as recvmsg and memset works on it
+	 */
+	vmf->page = alloc_page(GFP_HIGHUSER);
+	if (!vmf->page)
+		return VM_FAULT_OOM;
+	pg_addr = kmap(vmf->page);
+	pos = vmf->pgoff << PAGE_SHIFT;
 
 	count = PAGE_SIZE;
-	if (fdata->address + PAGE_SIZE > area->vm_end) {
+	if ((unsigned long)vmf->virtual_address + PAGE_SIZE > area->vm_end) {
 		WARN_ON(1); /* shouldn't happen? */
-		count = area->vm_end - fdata->address;
+		count = area->vm_end - (unsigned long)vmf->virtual_address;
 	}
 	/* what we can read in one go */
 	bufsize = NCP_SERVER(inode)->buffer_size;
@@ -85,17 +87,16 @@ static struct page* ncp_file_mmap_fault(struct vm_area_struct *area,
 
 	if (already_read < PAGE_SIZE)
 		memset(pg_addr + already_read, 0, PAGE_SIZE - already_read);
-	flush_dcache_page(page);
-	kunmap(page);
+	flush_dcache_page(vmf->page);
+	kunmap(vmf->page);
 
 	/*
 	 * If I understand ncp_read_kernel() properly, the above always
 	 * fetches from the network, here the analogue of disk.
 	 * -- wli
 	 */
-	fdata->type = VM_FAULT_MAJOR;
 	count_vm_event(PGMAJFAULT);
-	return page;
+	return VM_FAULT_MAJOR;
 }
 
 static struct vm_operations_struct ncp_file_mmap =
@@ -124,7 +125,6 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EFBIG;
 
 	vma->vm_ops = &ncp_file_mmap;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
 	file_accessed(file);
 	return 0;
 }
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index cd75508b1c8a..ee64749e2eeb 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -60,30 +60,28 @@ static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
 	return sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
-static struct page *ocfs2_fault(struct vm_area_struct *area,
-						struct fault_data *fdata)
+static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
-	struct page *page = NULL;
 	sigset_t blocked, oldset;
-	int ret;
+	int error, ret;
 
-	mlog_entry("(area=%p, page offset=%lu)\n", area, fdata->pgoff);
+	mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
 
-	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-	if (ret < 0) {
-		fdata->type = VM_FAULT_SIGBUS;
-		mlog_errno(ret);
+	error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
+	if (error < 0) {
+		mlog_errno(error);
+		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
-	page = filemap_fault(area, fdata);
+	ret = filemap_fault(area, vmf);
 
-	ret = ocfs2_vm_op_unblock_sigs(&oldset);
-	if (ret < 0)
-		mlog_errno(ret);
+	error = ocfs2_vm_op_unblock_sigs(&oldset);
+	if (error < 0)
+		mlog_errno(error);
 out:
-	mlog_exit_ptr(page);
-	return page;
+	mlog_exit_ptr(vmf->page);
+	return ret;
 }
 
 static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
@@ -225,7 +223,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
 out:
 	vma->vm_ops = &ocfs2_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f12e80a69c68..2d4be2f247b2 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -212,20 +212,18 @@ xfs_file_fsync(
 }
 
 #ifdef CONFIG_XFS_DMAPI
-STATIC struct page *
+STATIC int
 xfs_vm_fault(
 	struct vm_area_struct	*vma,
-	struct fault_data	*fdata)
+	struct vm_fault	*vmf)
 {
 	struct inode	*inode = vma->vm_file->f_path.dentry->d_inode;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0)) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
-	return filemap_fault(vma, fdata);
+	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0))
+		return VM_FAULT_SIGBUS;
+	return filemap_fault(vma, vmf);
 }
 #endif /* CONFIG_XFS_DMAPI */
 
@@ -311,7 +309,7 @@ xfs_file_mmap(
 	struct vm_area_struct *vma)
 {
 	vma->vm_ops = &xfs_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 
 #ifdef CONFIG_XFS_DMAPI
 	if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28a1b3e63a9..ff0b8844bd5a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -168,12 +168,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
 #define VM_ALWAYSDUMP	0x04000000	/* Always include in core dumps */
 
-#define VM_CAN_INVALIDATE 0x08000000	/* The mapping may be invalidated,
-					 * eg. truncate or invalidate_inode_*.
-					 * In this case, do_no_page must
-					 * return with the page locked.
-					 */
-#define VM_CAN_NONLINEAR 0x10000000	/* Has ->fault & does nonlinear pages */
+#define VM_CAN_NONLINEAR 0x08000000	/* Has ->fault & does nonlinear pages */
 
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -197,24 +192,44 @@ extern unsigned int kobjsize(const void *objp);
  */
 extern pgprot_t protection_map[16];
 
-#define FAULT_FLAG_WRITE	0x01
-#define FAULT_FLAG_NONLINEAR	0x02
+#define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
+#define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
+
+
+#define FAULT_RET_NOPAGE	0x0100	/* ->fault did not return a page. This
+					 * can be used if the handler installs
+					 * their own pte.
+					 */
+#define FAULT_RET_LOCKED	0x0200	/* ->fault locked the page, caller must
+					 * unlock after installing the mapping.
+					 * This is used by pagecache in
+					 * particular, where the page lock is
+					 * used to synchronise against truncate
+					 * and invalidate. Mutually exclusive
+					 * with FAULT_RET_NOPAGE.
+					 */
 
 /*
- * fault_data is filled in the the pagefault handler and passed to the
- * vma's ->fault function. That function is responsible for filling in
- * 'type', which is the type of fault if a page is returned, or the type
- * of error if NULL is returned.
+ * vm_fault is filled by the the pagefault handler and passed to the vma's
+ * ->fault function. The vma's ->fault is responsible for returning the
+ * VM_FAULT_xxx type which occupies the lowest byte of the return code, ORed
+ * with FAULT_RET_ flags that occupy the next byte and give details about
+ * how the fault was handled.
  *
- * pgoff should be used in favour of address, if possible. If pgoff is
- * used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get
- * nonlinear mapping support.
+ * pgoff should be used in favour of virtual_address, if possible. If pgoff
+ * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
+ * mapping support.
  */
-struct fault_data {
-	unsigned long address;
-	pgoff_t pgoff;
-	unsigned int flags;
-	int type;
+struct vm_fault {
+	unsigned int flags;		/* FAULT_FLAG_xxx flags */
+	pgoff_t pgoff;			/* Logical page offset based on vma */
+	void __user *virtual_address;	/* Faulting virtual address */
+
+	struct page *page;		/* ->fault handlers should return a
+					 * page here, unless FAULT_RET_NOPAGE
+					 * is set (which is also implied by
+					 * VM_FAULT_OOM or SIGBUS).
+					 */
 };
 
 /*
@@ -225,15 +240,11 @@ struct fault_data {
 struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
-	struct page *(*fault)(struct vm_area_struct *vma,
-			struct fault_data *fdata);
+	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
 	struct page *(*nopage)(struct vm_area_struct *area,
 			unsigned long address, int *type);
 	unsigned long (*nopfn)(struct vm_area_struct *area,
 			unsigned long address);
-	int (*populate)(struct vm_area_struct *area, unsigned long address,
-			unsigned long len, pgprot_t prot, unsigned long pgoff,
-			int nonblock);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
@@ -700,8 +711,14 @@ static inline int page_mapped(struct page *page)
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
  */
-#define VM_FAULT_OOM	0x00
-#define VM_FAULT_SIGBUS	0x01
+
+/*
+ * VM_FAULT_ERROR is set for the error cases, to make some tests simpler.
+ */
+#define VM_FAULT_ERROR	0x20
+
+#define VM_FAULT_OOM	(0x00 | VM_FAULT_ERROR)
+#define VM_FAULT_SIGBUS	(0x01 | VM_FAULT_ERROR)
 #define VM_FAULT_MINOR	0x02
 #define VM_FAULT_MAJOR	0x03
 
@@ -711,6 +728,11 @@ static inline int page_mapped(struct page *page)
  */
 #define VM_FAULT_WRITE	0x10
 
+/*
+ * Mask of VM_FAULT_ flags
+ */
+#define VM_FAULT_MASK	0xff
+
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 
 extern void show_free_areas(void);
@@ -793,8 +815,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
-extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
-extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 
 #ifdef CONFIG_MMU
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
@@ -1135,11 +1155,7 @@ extern void truncate_inode_pages_range(struct address_space *,
 				       loff_t lstart, loff_t lend);
 
 /* generic vm_area_ops exported for stackable file systems */
-extern struct page *filemap_fault(struct vm_area_struct *, struct fault_data *);
-extern struct page * __deprecated_for_modules
-filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-extern int __deprecated_for_modules filemap_populate(struct vm_area_struct *,
-		unsigned long, unsigned long, pgprot_t, unsigned long, int);
+extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
diff --git a/ipc/shm.c b/ipc/shm.c
index e2d090348b1e..d0259e3ad1c0 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -224,13 +224,12 @@ static void shm_close(struct vm_area_struct *vma)
 	mutex_unlock(&shm_ids(ns).mutex);
 }
 
-static struct page *shm_fault(struct vm_area_struct *vma,
-					struct fault_data *fdata)
+static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 
-	return sfd->vm_ops->fault(vma, fdata);
+	return sfd->vm_ops->fault(vma, vmf);
 }
 
 #ifdef CONFIG_NUMA
diff --git a/mm/filemap.c b/mm/filemap.c
index 26b992d169e5..0876cc57255f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1302,8 +1302,8 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
 
 /**
  * filemap_fault - read in file data for page fault handling
- * @vma:	user vma (not used)
- * @fdata:	the applicable fault_data
+ * @vma:	vma in which the fault was taken
+ * @vmf:	struct vm_fault containing details of the fault
  *
  * filemap_fault() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
@@ -1312,7 +1312,7 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
  * it in the page cache, and handles the special cases reasonably without
  * having a lot of duplicated code.
  */
-struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int error;
 	struct file *file = vma->vm_file;
@@ -1322,13 +1322,12 @@ struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
 	struct page *page;
 	unsigned long size;
 	int did_readaround = 0;
+	int ret;
 
-	fdata->type = VM_FAULT_MINOR;
-
-	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
+	ret = VM_FAULT_MINOR;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (fdata->pgoff >= size)
+	if (vmf->pgoff >= size)
 		goto outside_data_content;
 
 	/* If we don't want any read-ahead, don't bother */
@@ -1342,18 +1341,18 @@ struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
 	if (VM_SequentialReadHint(vma))
-		page_cache_readahead(mapping, ra, file, fdata->pgoff, 1);
+		page_cache_readahead(mapping, ra, file, vmf->pgoff, 1);
 
 	/*
 	 * Do we have something in the page cache already?
 	 */
 retry_find:
-	page = find_lock_page(mapping, fdata->pgoff);
+	page = find_lock_page(mapping, vmf->pgoff);
 	if (!page) {
 		unsigned long ra_pages;
 
 		if (VM_SequentialReadHint(vma)) {
-			handle_ra_miss(mapping, ra, fdata->pgoff);
+			handle_ra_miss(mapping, ra, vmf->pgoff);
 			goto no_cached_page;
 		}
 		ra->mmap_miss++;
@@ -1370,7 +1369,7 @@ retry_find:
 		 * check did_readaround, as this is an inner loop.
 		 */
 		if (!did_readaround) {
-			fdata->type = VM_FAULT_MAJOR;
+			ret = VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 		}
 		did_readaround = 1;
@@ -1378,11 +1377,11 @@ retry_find:
 		if (ra_pages) {
 			pgoff_t start = 0;
 
-			if (fdata->pgoff > ra_pages / 2)
-				start = fdata->pgoff - ra_pages / 2;
+			if (vmf->pgoff > ra_pages / 2)
+				start = vmf->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, fdata->pgoff);
+		page = find_lock_page(mapping, vmf->pgoff);
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1399,7 +1398,7 @@ retry_find:
 
 	/* Must recheck i_size under page lock */
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (unlikely(fdata->pgoff >= size)) {
+	if (unlikely(vmf->pgoff >= size)) {
 		unlock_page(page);
 		goto outside_data_content;
 	}
@@ -1408,24 +1407,24 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 */
 	mark_page_accessed(page);
-	return page;
+	vmf->page = page;
+	return ret | FAULT_RET_LOCKED;
 
 outside_data_content:
 	/*
 	 * An external ptracer can access pages that normally aren't
 	 * accessible..
 	 */
-	if (vma->vm_mm == current->mm) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
+	if (vma->vm_mm == current->mm)
+		return VM_FAULT_SIGBUS;
+
 	/* Fall through to the non-read-ahead case */
 no_cached_page:
 	/*
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, fdata->pgoff);
+	error = page_cache_read(file, vmf->pgoff);
 
 	/*
 	 * The page we want has now been added to the page cache.
@@ -1441,15 +1440,13 @@ no_cached_page:
 	 * to schedule I/O.
 	 */
 	if (error == -ENOMEM)
-		fdata->type = VM_FAULT_OOM;
-	else
-		fdata->type = VM_FAULT_SIGBUS;
-	return NULL;
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
 
 page_not_uptodate:
 	/* IO error path */
 	if (!did_readaround) {
-		fdata->type = VM_FAULT_MAJOR;
+		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
 	}
 
@@ -1468,206 +1465,10 @@ page_not_uptodate:
 
 	/* Things didn't work out. Return zero to tell the mm layer so. */
 	shrink_readahead_size_eio(file, ra);
-	fdata->type = VM_FAULT_SIGBUS;
-	return NULL;
+	return VM_FAULT_SIGBUS;
 }
 EXPORT_SYMBOL(filemap_fault);
 
-/*
- * filemap_nopage and filemap_populate are legacy exports that are not used
- * in tree. Scheduled for removal.
- */
-struct page *filemap_nopage(struct vm_area_struct *area,
-				unsigned long address, int *type)
-{
-	struct page *page;
-	struct fault_data fdata;
-	fdata.address = address;
-	fdata.pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
-			+ area->vm_pgoff;
-	fdata.flags = 0;
-
-	page = filemap_fault(area, &fdata);
-	if (type)
-		*type = fdata.type;
-
-	return page;
-}
-EXPORT_SYMBOL(filemap_nopage);
-
-static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
-					int nonblock)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct page *page;
-	int error;
-
-	/*
-	 * Do we have something in the page cache already?
-	 */
-retry_find:
-	page = find_get_page(mapping, pgoff);
-	if (!page) {
-		if (nonblock)
-			return NULL;
-		goto no_cached_page;
-	}
-
-	/*
-	 * Ok, found a page in the page cache, now we need to check
-	 * that it's up-to-date.
-	 */
-	if (!PageUptodate(page)) {
-		if (nonblock) {
-			page_cache_release(page);
-			return NULL;
-		}
-		goto page_not_uptodate;
-	}
-
-success:
-	/*
-	 * Found the page and have a reference on it.
-	 */
-	mark_page_accessed(page);
-	return page;
-
-no_cached_page:
-	error = page_cache_read(file, pgoff);
-
-	/*
-	 * The page we want has now been added to the page cache.
-	 * In the unlikely event that someone removed it in the
-	 * meantime, we'll just come back here and read it again.
-	 */
-	if (error >= 0)
-		goto retry_find;
-
-	/*
-	 * An error return from page_cache_read can result if the
-	 * system is low on memory, or a problem occurs while trying
-	 * to schedule I/O.
-	 */
-	return NULL;
-
-page_not_uptodate:
-	lock_page(page);
-
-	/* Did it get truncated while we waited for it? */
-	if (!page->mapping) {
-		unlock_page(page);
-		goto err;
-	}
-
-	/* Did somebody else get it up-to-date? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Umm, take care of errors if the page isn't up-to-date.
-	 * Try to re-read it _once_. We do this synchronously,
-	 * because there really aren't any performance issues here
-	 * and we need to check for errors.
-	 */
-	lock_page(page);
-
-	/* Somebody truncated the page on us? */
-	if (!page->mapping) {
-		unlock_page(page);
-		goto err;
-	}
-	/* Somebody else successfully read it in? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-
-	ClearPageError(page);
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Things didn't work out. Return zero to tell the
-	 * mm layer so, possibly freeing the page cache page first.
-	 */
-err:
-	page_cache_release(page);
-
-	return NULL;
-}
-
-int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long len, pgprot_t prot, unsigned long pgoff,
-		int nonblock)
-{
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	unsigned long size;
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-	int err;
-
-	if (!nonblock)
-		force_page_cache_readahead(mapping, vma->vm_file,
-					pgoff, len >> PAGE_CACHE_SHIFT);
-
-repeat:
-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
-		return -EINVAL;
-
-	page = filemap_getpage(file, pgoff, nonblock);
-
-	/* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
-	 * done in shmem_populate calling shmem_getpage */
-	if (!page && !nonblock)
-		return -ENOMEM;
-
-	if (page) {
-		err = install_page(mm, vma, addr, page, prot);
-		if (err) {
-			page_cache_release(page);
-			return err;
-		}
-	} else if (vma->vm_flags & VM_NONLINEAR) {
-		/* No page was found just because we can't read it in now (being
-		 * here implies nonblock != 0), but the page may exist, so set
-		 * the PTE to fault it in later. */
-		err = install_file_pte(mm, vma, addr, pgoff, prot);
-		if (err)
-			return err;
-	}
-
-	len -= PAGE_SIZE;
-	addr += PAGE_SIZE;
-	pgoff++;
-	if (len)
-		goto repeat;
-
-	return 0;
-}
-EXPORT_SYMBOL(filemap_populate);
-
 struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 };
@@ -1682,7 +1483,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 82f4b8e9834e..847d5d78163e 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -210,8 +210,7 @@ __xip_unmap (struct address_space * mapping,
  *
  * This function is derived from filemap_fault, but used for execute in place
  */
-static struct page *xip_file_fault(struct vm_area_struct *area,
-					struct fault_data *fdata)
+static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
 	struct file *file = area->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -222,19 +221,15 @@ static struct page *xip_file_fault(struct vm_area_struct *area,
 	/* XXX: are VM_FAULT_ codes OK? */
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (fdata->pgoff >= size) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
+	if (vmf->pgoff >= size)
+		return VM_FAULT_SIGBUS;
 
 	page = mapping->a_ops->get_xip_page(mapping,
-					fdata->pgoff*(PAGE_SIZE/512), 0);
+					vmf->pgoff*(PAGE_SIZE/512), 0);
 	if (!IS_ERR(page))
 		goto out;
-	if (PTR_ERR(page) != -ENODATA) {
-		fdata->type = VM_FAULT_OOM;
-		return NULL;
-	}
+	if (PTR_ERR(page) != -ENODATA)
+		return VM_FAULT_OOM;
 
 	/* sparse block */
 	if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
@@ -242,26 +237,22 @@ static struct page *xip_file_fault(struct vm_area_struct *area,
 	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
 		/* maybe shared writable, allocate new block */
 		page = mapping->a_ops->get_xip_page(mapping,
-					fdata->pgoff*(PAGE_SIZE/512), 1);
-		if (IS_ERR(page)) {
-			fdata->type = VM_FAULT_SIGBUS;
-			return NULL;
-		}
+					vmf->pgoff*(PAGE_SIZE/512), 1);
+		if (IS_ERR(page))
+			return VM_FAULT_SIGBUS;
 		/* unmap page at pgoff from all other vmas */
-		__xip_unmap(mapping, fdata->pgoff);
+		__xip_unmap(mapping, vmf->pgoff);
 	} else {
 		/* not shared and writable, use xip_sparse_page() */
 		page = xip_sparse_page();
-		if (!page) {
-			fdata->type = VM_FAULT_OOM;
-			return NULL;
-		}
+		if (!page)
+			return VM_FAULT_OOM;
 	}
 
 out:
-	fdata->type = VM_FAULT_MINOR;
 	page_cache_get(page);
-	return page;
+	vmf->page = page;
+	return VM_FAULT_MINOR;
 }
 
 static struct vm_operations_struct xip_file_vm_ops = {
diff --git a/mm/fremap.c b/mm/fremap.c
index 01e51f01b84e..5f50d736a037 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,13 +20,14 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
 	pte_t pte = *ptep;
-	struct page *page = NULL;
 
 	if (pte_present(pte)) {
+		struct page *page;
+
 		flush_cache_page(vma, addr, pte_pfn(pte));
 		pte = ptep_clear_flush(vma, addr, ptep);
 		page = vm_normal_page(vma, addr, pte);
@@ -35,68 +36,21 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 				set_page_dirty(page);
 			page_remove_rmap(page, vma);
 			page_cache_release(page);
+			update_hiwater_rss(mm);
+			dec_mm_counter(mm, file_rss);
 		}
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear_not_present_full(mm, addr, ptep, 0);
 	}
-	return !!page;
 }
 
-/*
- * Install a file page to a given virtual memory address, release any
- * previously existing mapping.
- */
-int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, struct page *page, pgprot_t prot)
-{
-	struct inode *inode;
-	pgoff_t size;
-	int err = -ENOMEM;
-	pte_t *pte;
-	pte_t pte_val;
-	spinlock_t *ptl;
-
-	pte = get_locked_pte(mm, addr, &ptl);
-	if (!pte)
-		goto out;
-
-	/*
-	 * This page may have been truncated. Tell the
-	 * caller about it.
-	 */
-	err = -EINVAL;
-	inode = vma->vm_file->f_mapping->host;
-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (!page->mapping || page->index >= size)
-		goto unlock;
-	err = -ENOMEM;
-	if (page_mapcount(page) > INT_MAX/2)
-		goto unlock;
-
-	if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
-		inc_mm_counter(mm, file_rss);
-
-	flush_icache_page(vma, page);
-	pte_val = mk_pte(page, prot);
-	set_pte_at(mm, addr, pte, pte_val);
-	page_add_file_rmap(page);
-	update_mmu_cache(vma, addr, pte_val);
-	lazy_mmu_prot_update(pte_val);
-	err = 0;
-unlock:
-	pte_unmap_unlock(pte, ptl);
-out:
-	return err;
-}
-EXPORT_SYMBOL(install_page);
-
 /*
  * Install a file pte to a given virtual memory address, release any
  * previously existing mapping.
  */
-int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long pgoff, pgprot_t prot)
 {
 	int err = -ENOMEM;
@@ -107,10 +61,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte)
 		goto out;
 
-	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
-		update_hiwater_rss(mm);
-		dec_mm_counter(mm, file_rss);
-	}
+	if (!pte_none(*pte))
+		zap_pte(mm, vma, addr, pte);
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	/*
@@ -208,8 +160,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
 		goto out;
 
-	if ((!vma->vm_ops || !vma->vm_ops->populate) &&
-					!(vma->vm_flags & VM_CAN_NONLINEAR))
+	if (!vma->vm_flags & VM_CAN_NONLINEAR)
 		goto out;
 
 	if (end <= start || start < vma->vm_start || end > vma->vm_end)
@@ -239,18 +190,14 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
-	if (vma->vm_flags & VM_CAN_NONLINEAR) {
-		err = populate_range(mm, vma, start, size, pgoff);
-		if (!err && !(flags & MAP_NONBLOCK)) {
-			if (unlikely(has_write_lock)) {
-				downgrade_write(&mm->mmap_sem);
-				has_write_lock = 0;
-			}
-			make_pages_present(start, start+size);
+	err = populate_range(mm, vma, start, size, pgoff);
+	if (!err && !(flags & MAP_NONBLOCK)) {
+		if (unlikely(has_write_lock)) {
+			downgrade_write(&mm->mmap_sem);
+			has_write_lock = 0;
 		}
-	} else
-		err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
-					    	pgoff, flags & MAP_NONBLOCK);
+		make_pages_present(start, start+size);
+	}
 
 	/*
 	 * We can't clear VM_NONLINEAR because we'd have to do
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6912bbf33faa..aaa7c1a682d9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -316,15 +316,14 @@ unsigned long hugetlb_total_pages(void)
  * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
  * this far.
  */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				unsigned long address, int *unused)
+static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	BUG();
-	return NULL;
+	return 0;
 }
 
 struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
+	.fault = hugetlb_vm_op_fault,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
diff --git a/mm/memory.c b/mm/memory.c
index 7abd3899848b..23c870479b3e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1834,10 +1834,10 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
 
 	/*
 	 * files that support invalidating or truncating portions of the
-	 * file from under mmaped areas must set the VM_CAN_INVALIDATE flag, and
-	 * have their .nopage function return the page locked.
+	 * file from under mmaped areas must have their ->fault function
+	 * return a locked page (and FAULT_RET_LOCKED code). This provides
+	 * synchronisation against concurrent unmapping here.
 	 */
-	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
 
 again:
 	restart_addr = vma->vm_truncate_count;
@@ -2306,63 +2306,62 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page, *faulted_page;
+	struct page *page;
 	pte_t entry;
 	int anon = 0;
 	struct page *dirty_page = NULL;
-	struct fault_data fdata;
+	struct vm_fault vmf;
+	int ret;
 
-	fdata.address = address & PAGE_MASK;
-	fdata.pgoff = pgoff;
-	fdata.flags = flags;
+	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+	vmf.pgoff = pgoff;
+	vmf.flags = flags;
+	vmf.page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
 	if (likely(vma->vm_ops->fault)) {
-		fdata.type = -1;
-		faulted_page = vma->vm_ops->fault(vma, &fdata);
-		WARN_ON(fdata.type == -1);
-		if (unlikely(!faulted_page))
-			return fdata.type;
+		ret = vma->vm_ops->fault(vma, &vmf);
+		if (unlikely(ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE)))
+			return (ret & VM_FAULT_MASK);
 	} else {
 		/* Legacy ->nopage path */
-		fdata.type = VM_FAULT_MINOR;
-		faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
-								&fdata.type);
+		ret = VM_FAULT_MINOR;
+		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 		/* no page was available -- either SIGBUS or OOM */
-		if (unlikely(faulted_page == NOPAGE_SIGBUS))
+		if (unlikely(vmf.page == NOPAGE_SIGBUS))
 			return VM_FAULT_SIGBUS;
-		else if (unlikely(faulted_page == NOPAGE_OOM))
+		else if (unlikely(vmf.page == NOPAGE_OOM))
 			return VM_FAULT_OOM;
 	}
 
 	/*
-	 * For consistency in subsequent calls, make the faulted_page always
+	 * For consistency in subsequent calls, make the faulted page always
 	 * locked.
 	 */
-	if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
-		lock_page(faulted_page);
+	if (unlikely(!(ret & FAULT_RET_LOCKED)))
+		lock_page(vmf.page);
 	else
-		BUG_ON(!PageLocked(faulted_page));
+		VM_BUG_ON(!PageLocked(vmf.page));
 
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	page = faulted_page;
+	page = vmf.page;
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
 			anon = 1;
 			if (unlikely(anon_vma_prepare(vma))) {
-				fdata.type = VM_FAULT_OOM;
+				ret = VM_FAULT_OOM;
 				goto out;
 			}
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 			if (!page) {
-				fdata.type = VM_FAULT_OOM;
+				ret = VM_FAULT_OOM;
 				goto out;
 			}
-			copy_user_highpage(page, faulted_page, address, vma);
+			copy_user_highpage(page, vmf.page, address, vma);
 		} else {
 			/*
 			 * If the page will be shareable, see if the backing
@@ -2372,11 +2371,23 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (vma->vm_ops->page_mkwrite) {
 				unlock_page(page);
 				if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
-					fdata.type = VM_FAULT_SIGBUS;
-					anon = 1; /* no anon but release faulted_page */
+					ret = VM_FAULT_SIGBUS;
+					anon = 1; /* no anon but release vmf.page */
 					goto out_unlocked;
 				}
 				lock_page(page);
+				/*
+				 * XXX: this is not quite right (racy vs
+				 * invalidate) to unlock and relock the page
+				 * like this, however a better fix requires
+				 * reworking page_mkwrite locking API, which
+				 * is better done later.
+				 */
+				if (!page->mapping) {
+					ret = VM_FAULT_MINOR;
+					anon = 1; /* no anon but release vmf.page */
+					goto out;
+				}
 			}
 		}
 
@@ -2427,16 +2438,16 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(faulted_page);
+	unlock_page(vmf.page);
 out_unlocked:
 	if (anon)
-		page_cache_release(faulted_page);
+		page_cache_release(vmf.page);
 	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 
-	return fdata.type;
+	return (ret & VM_FAULT_MASK);
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -2447,18 +2458,10 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
 	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
 
-	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+							flags, orig_pte);
 }
 
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access, pgoff_t pgoff, pte_t orig_pte)
-{
-	unsigned int flags = FAULT_FLAG_NONLINEAR |
-				(write_access ? FAULT_FLAG_WRITE : 0);
-
-	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
-}
 
 /*
  * do_no_pfn() tries to create a new page mapping for a page without
@@ -2519,17 +2522,19 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access, pte_t orig_pte)
 {
+	unsigned int flags = FAULT_FLAG_NONLINEAR |
+				(write_access ? FAULT_FLAG_WRITE : 0);
 	pgoff_t pgoff;
-	int err;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return VM_FAULT_MINOR;
 
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
+			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
@@ -2539,18 +2544,8 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	pgoff = pte_to_pgoff(orig_pte);
 
-	if (vma->vm_ops && vma->vm_ops->fault)
-		return do_nonlinear_fault(mm, vma, address, page_table, pmd,
-					write_access, pgoff, orig_pte);
-
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
-	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
-					vma->vm_page_prot, pgoff, 0);
-	if (err == -ENOMEM)
-		return VM_FAULT_OOM;
-	if (err)
-		return VM_FAULT_SIGBUS;
-	return VM_FAULT_MAJOR;
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+							flags, orig_pte);
 }
 
 /*
@@ -2588,7 +2583,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 						 pte, pmd, write_access);
 		}
 		if (pte_file(entry))
-			return do_file_page(mm, vma, address,
+			return do_nonlinear_fault(mm, vma, address,
 					pte, pmd, write_access, entry);
 		return do_swap_page(mm, vma, address,
 					pte, pmd, write_access, entry);
diff --git a/mm/nommu.c b/mm/nommu.c
index aee0e1b0ebe7..1b105d28949f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1341,10 +1341,10 @@ int in_gate_area_no_task(unsigned long addr)
 	return 0;
 }
 
-struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	BUG();
-	return NULL;
+	return 0;
 }
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 6b44440f1b24..0a555af8733d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1309,29 +1309,21 @@ failed:
 	return error;
 }
 
-static struct page *shmem_fault(struct vm_area_struct *vma,
-					struct fault_data *fdata)
+static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
-	struct page *page = NULL;
 	int error;
+	int ret;
 
-	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
+	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
 
-	if (((loff_t)fdata->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
-
-	error = shmem_getpage(inode, fdata->pgoff, &page,
-						SGP_FAULT, &fdata->type);
-	if (error) {
-		fdata->type = ((error == -ENOMEM)?VM_FAULT_OOM:VM_FAULT_SIGBUS);
-		return NULL;
-	}
+	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret);
+	if (error)
+		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
 
-	mark_page_accessed(page);
-	return page;
+	mark_page_accessed(vmf->page);
+	return ret | FAULT_RET_LOCKED;
 }
 
 #ifdef CONFIG_NUMA
@@ -1378,7 +1370,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
@@ -2560,6 +2552,5 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 		fput(vma->vm_file);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
 	return 0;
 }
-- 
cgit v1.3-8-gc7d7


From 83c54070ee1a2d05c89793884bea1a03f2851ed4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 19 Jul 2007 01:47:05 -0700
Subject: mm: fault feedback #2

This patch completes Linus's wish that the fault return codes be made into
bit flags, which I agree makes everything nicer.  This requires requires
all handle_mm_fault callers to be modified (possibly the modifications
should go further and do things like fault accounting in handle_mm_fault --
however that would be for another patch).

[akpm@linux-foundation.org: fix alpha build]
[akpm@linux-foundation.org: fix s390 build]
[akpm@linux-foundation.org: fix sparc build]
[akpm@linux-foundation.org: fix sparc64 build]
[akpm@linux-foundation.org: fix ia64 build]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Bryan Wu <bryan.wu@analog.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Greg Ungerer <gerg@uclinux.org>
Cc: Matthew Wilcox <willy@debian.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Acked-by: Kyle McMartin <kyle@mcmartin.ca>
Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Still apparently needs some ARM and PPC loving - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/mm/fault.c                     | 22 ++++-----
 arch/arm/mm/fault.c                       | 36 +++++++-------
 arch/arm26/mm/fault.c                     | 30 ++++++------
 arch/avr32/mm/fault.c                     | 23 +++++----
 arch/cris/mm/fault.c                      | 23 ++++-----
 arch/frv/mm/fault.c                       | 23 ++++-----
 arch/i386/mm/fault.c                      | 23 +++++----
 arch/ia64/mm/fault.c                      | 26 +++++-----
 arch/m32r/mm/fault.c                      | 23 +++++----
 arch/m68k/mm/fault.c                      | 21 ++++----
 arch/mips/mm/fault.c                      | 23 +++++----
 arch/parisc/mm/fault.c                    | 23 ++++-----
 arch/powerpc/mm/fault.c                   | 26 +++++-----
 arch/powerpc/platforms/cell/spufs/fault.c | 28 +++++------
 arch/ppc/mm/fault.c                       | 23 +++++----
 arch/s390/lib/uaccess_pt.c                | 23 +++++----
 arch/s390/mm/fault.c                      | 30 ++++++------
 arch/sh/mm/fault.c                        | 23 +++++----
 arch/sh64/mm/fault.c                      | 24 +++++-----
 arch/sparc/mm/fault.c                     | 22 ++++-----
 arch/sparc64/mm/fault.c                   | 24 +++++-----
 arch/um/kernel/trap.c                     | 29 +++++------
 arch/x86_64/mm/fault.c                    | 25 +++++-----
 arch/xtensa/mm/fault.c                    | 23 +++++----
 fs/gfs2/ops_vm.c                          | 11 +++--
 include/linux/mm.h                        | 58 +++++-----------------
 kernel/futex.c                            | 21 ++++----
 mm/filemap.c                              |  6 +--
 mm/filemap_xip.c                          |  2 +-
 mm/hugetlb.c                              | 10 ++--
 mm/memory.c                               | 80 +++++++++++++++----------------
 mm/shmem.c                                |  8 ++--
 32 files changed, 373 insertions(+), 419 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index f5862792a167..a0e18da594d9 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -148,21 +148,17 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	   the fault.  */
 	fault = handle_mm_fault(mm, vma, address, cause > 0);
 	up_read(&mm->mmap_sem);
-
-	switch (fault) {
-	      case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	      case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	      case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	      case VM_FAULT_OOM:
-		goto out_of_memory;
-	      default:
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 	return;
 
 	/* Something tried to access memory that isn't in our memory map.
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 75d491448e45..c04124a095cf 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -183,20 +183,20 @@ good_area:
 	 */
 survive:
 	fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11));
-
-	/*
-	 * Handle the "normal" cases first - successful and sigbus
-	 */
-	switch (fault) {
-	case VM_FAULT_MAJOR:
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			return fault;
+		BUG();
+	}
+	if (fault & VM_FAULT_MAJOR)
 		tsk->maj_flt++;
-		return fault;
-	case VM_FAULT_MINOR:
+	else
 		tsk->min_flt++;
-	case VM_FAULT_SIGBUS:
-		return fault;
-	}
+	return fault;
 
+out_of_memory:
 	if (!is_init(tsk))
 		goto out;
 
@@ -249,7 +249,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	/*
 	 * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
 	 */
-	if (fault >= VM_FAULT_MINOR)
+	if (likely(!(fault & VM_FAULT_ERROR)))
 		return 0;
 
 	/*
@@ -259,8 +259,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (!user_mode(regs))
 		goto no_context;
 
-	switch (fault) {
-	case VM_FAULT_OOM:
+	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, or some other thing
 		 * happened to us that made us unable to handle
@@ -269,17 +268,15 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		printk("VM: killing process %s\n", tsk->comm);
 		do_exit(SIGKILL);
 		return 0;
-
-	case VM_FAULT_SIGBUS:
+	}
+	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to
 		 * successfully fix up this page fault.
 		 */
 		sig = SIGBUS;
 		code = BUS_ADRERR;
-		break;
-
-	default:
+	} else {
 		/*
 		 * Something tried to access memory that
 		 * isn't in our memory map..
@@ -287,7 +284,6 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 		sig = SIGSEGV;
 		code = fault == VM_FAULT_BADACCESS ?
 			SEGV_ACCERR : SEGV_MAPERR;
-		break;
 	}
 
 	__do_user_fault(tsk, addr, fsr, sig, code, regs);
diff --git a/arch/arm26/mm/fault.c b/arch/arm26/mm/fault.c
index 93c0cee0fb5e..dec638a0c8d9 100644
--- a/arch/arm26/mm/fault.c
+++ b/arch/arm26/mm/fault.c
@@ -170,20 +170,20 @@ good_area:
 	 */
 survive:
 	fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, DO_COW(fsr));
-
-	/*
-	 * Handle the "normal" cases first - successful and sigbus
-	 */
-	switch (fault) {
-	case VM_FAULT_MAJOR:
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			return fault;
+		BUG();
+	}
+	if (fault & VM_FAULT_MAJOR)
 		tsk->maj_flt++;
-		return fault;
-	case VM_FAULT_MINOR:
+	else
 		tsk->min_flt++;
-	case VM_FAULT_SIGBUS:
-		return fault;
-	}
+	return fault;
 
+out_of_memory:
 	fault = -3; /* out of memory */
 	if (!is_init(tsk))
 		goto out;
@@ -225,13 +225,11 @@ int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	/*
 	 * Handle the "normal" case first
 	 */
-	switch (fault) {
-	case VM_FAULT_MINOR:
-	case VM_FAULT_MAJOR:
+	if (likely(!(fault & VM_FAULT_ERROR)))
 		return 0;
-	case VM_FAULT_SIGBUS:
+	if (fault & VM_FAULT_SIGBUS)
 		goto do_sigbus;
-	}
+	/* else VM_FAULT_OOM */
 
 	/*
 	 * If we are in kernel mode at this point, we
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index 4b2495285d94..ae2d2c593b2b 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -64,6 +64,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 	int writeaccess;
 	long signr;
 	int code;
+	int fault;
 
 	if (notify_page_fault(regs, ecr))
 		return;
@@ -132,20 +133,18 @@ good_area:
 	 * fault.
 	 */
 survive:
-	switch (handle_mm_fault(mm, vma, address, writeaccess)) {
-	case VM_FAULT_MINOR:
-		tsk->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		tsk->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	case VM_FAULT_OOM:
-		goto out_of_memory;
-	default:
+	fault = handle_mm_fault(mm, vma, address, writeaccess);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index c73e91f1299a..8672ab7d7978 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -179,6 +179,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
 	siginfo_t info;
+	int fault;
 
         D(printk("Page fault for %lX on %X at %lX, prot %d write %d\n",
                  address, smp_processor_id(), instruction_pointer(regs),
@@ -283,18 +284,18 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	 * the fault.
 	 */
 
-	switch (handle_mm_fault(mm, vma, address, writeaccess & 1)) {
-	case VM_FAULT_MINOR:
-		tsk->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		tsk->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	default:
-		goto out_of_memory;
+	fault = handle_mm_fault(mm, vma, address, writeaccess & 1);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index 3f12296c3688..6798fa0257b1 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -40,6 +40,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	pud_t *pue;
 	pte_t *pte;
 	int write;
+	int fault;
 
 #if 0
 	const char *atxc[16] = {
@@ -162,18 +163,18 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, ear0, write)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	default:
-		goto out_of_memory;
+	fault = handle_mm_fault(mm, vma, ear0, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 1ecb3e43b523..e92a10124935 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -303,6 +303,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
 	struct vm_area_struct * vma;
 	unsigned long address;
 	int write, si_code;
+	int fault;
 
 	/* get the address */
         address = read_cr2();
@@ -422,20 +423,18 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-		case VM_FAULT_MINOR:
-			tsk->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			tsk->maj_flt++;
-			break;
-		case VM_FAULT_SIGBUS:
-			goto do_sigbus;
-		case VM_FAULT_OOM:
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
-		default:
-			BUG();
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 
 	/*
 	 * Did it hit the DOS screen memory VA from vm86 mode?
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index b87f785c2416..73ccb6010c05 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -80,6 +80,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	struct mm_struct *mm = current->mm;
 	struct siginfo si;
 	unsigned long mask;
+	int fault;
 
 	/* mmap_sem is performance critical.... */
 	prefetchw(&mm->mmap_sem);
@@ -147,26 +148,25 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	 * sure we exit gracefully rather than endlessly redo the
 	 * fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0)) {
-	      case VM_FAULT_MINOR:
-		++current->min_flt;
-		break;
-	      case VM_FAULT_MAJOR:
-		++current->maj_flt;
-		break;
-	      case VM_FAULT_SIGBUS:
+	fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
 		/*
 		 * We ran out of memory, or some other thing happened
 		 * to us that made us unable to handle the page fault
 		 * gracefully.
 		 */
-		signal = SIGBUS;
-		goto bad_area;
-	      case VM_FAULT_OOM:
-		goto out_of_memory;
-	      default:
+		if (fault & VM_FAULT_OOM) {
+			goto out_of_memory;
+		} else if (fault & VM_FAULT_SIGBUS) {
+			signal = SIGBUS;
+			goto bad_area;
+		}
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 	up_read(&mm->mmap_sem);
 	return;
 
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index f3935ba24946..676a1c443d28 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -80,6 +80,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	struct vm_area_struct * vma;
 	unsigned long page, addr;
 	int write;
+	int fault;
 	siginfo_t info;
 
 	/*
@@ -195,20 +196,18 @@ survive:
 	 */
 	addr = (address & PAGE_MASK);
 	set_thread_fault_code(error_code);
-	switch (handle_mm_fault(mm, vma, addr, write)) {
-		case VM_FAULT_MINOR:
-			tsk->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			tsk->maj_flt++;
-			break;
-		case VM_FAULT_SIGBUS:
-			goto do_sigbus;
-		case VM_FAULT_OOM:
+	fault = handle_mm_fault(mm, vma, addr, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
-		default:
-			BUG();
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 	set_thread_fault_code(0);
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 2adbeb16e1b8..578b48f47b9e 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -159,18 +159,17 @@ good_area:
 #ifdef DEBUG
 	printk("handle_mm_fault returns %d\n",fault);
 #endif
-	switch (fault) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto bus_err;
-	default:
-		goto out_of_memory;
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto bus_err;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	return 0;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 7ebea331edb8..521771b373de 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -39,6 +39,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	struct mm_struct *mm = tsk->mm;
 	const int field = sizeof(unsigned long) * 2;
 	siginfo_t info;
+	int fault;
 
 #if 0
 	printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(),
@@ -102,20 +103,18 @@ survive:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-	case VM_FAULT_MINOR:
-		tsk->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		tsk->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	case VM_FAULT_OOM:
-		goto out_of_memory;
-	default:
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index f6f67554c623..7899ab87785a 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -147,6 +147,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	struct mm_struct *mm = tsk->mm;
 	const struct exception_table_entry *fix;
 	unsigned long acc_type;
+	int fault;
 
 	if (in_atomic() || !mm)
 		goto no_context;
@@ -173,23 +174,23 @@ good_area:
 	 * fault.
 	 */
 
-	switch (handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0)) {
-	      case VM_FAULT_MINOR:
-		++current->min_flt;
-		break;
-	      case VM_FAULT_MAJOR:
-		++current->maj_flt;
-		break;
-	      case VM_FAULT_SIGBUS:
+	fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
 		/*
 		 * We hit a shared mapping outside of the file, or some
 		 * other thing happened to us that made us unable to
 		 * handle the page fault gracefully.
 		 */
-		goto bad_area;
-	      default:
-		goto out_of_memory;
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto bad_area;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 	up_read(&mm->mmap_sem);
 	return;
 
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 0ece51310bfe..3767211b3d0f 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -145,7 +145,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	struct mm_struct *mm = current->mm;
 	siginfo_t info;
 	int code = SEGV_MAPERR;
-	int is_write = 0;
+	int is_write = 0, ret;
 	int trap = TRAP(regs);
  	int is_exec = trap == 0x400;
 
@@ -330,22 +330,18 @@ good_area:
 	 * the fault.
 	 */
  survive:
-	switch (handle_mm_fault(mm, vma, address, is_write)) {
-
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	case VM_FAULT_OOM:
-		goto out_of_memory;
-	default:
+	ret = handle_mm_fault(mm, vma, address, is_write);
+	if (unlikely(ret & VM_FAULT_ERROR)) {
+		if (ret & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (ret & VM_FAULT_SIGBUS)
+			goto do_sigbus;
 		BUG();
 	}
-
+	if (ret & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 	up_read(&mm->mmap_sem);
 	return 0;
 
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index e064d0c0d80e..07f88de0544d 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -74,23 +74,21 @@ good_area:
 			goto bad_area;
 	}
 	ret = 0;
-	*flt = handle_mm_fault(mm, vma, ea, is_write);
-	switch (*flt) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		ret = -EFAULT;
-		goto bad_area;
-	case VM_FAULT_OOM:
-		ret = -ENOMEM;
-		goto bad_area;
-	default:
+	fault = handle_mm_fault(mm, vma, ea, is_write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM) {
+			ret = -ENOMEM;
+			goto bad_area;
+		} else if (fault & VM_FAULT_SIGBUS) {
+			ret = -EFAULT;
+			goto bad_area;
+		}
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 	up_read(&mm->mmap_sem);
 	return ret;
 
diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c
index 465f451f3bc3..b98244e277fb 100644
--- a/arch/ppc/mm/fault.c
+++ b/arch/ppc/mm/fault.c
@@ -96,6 +96,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	struct mm_struct *mm = current->mm;
 	siginfo_t info;
 	int code = SEGV_MAPERR;
+	int fault;
 #if defined(CONFIG_4xx) || defined (CONFIG_BOOKE)
 	int is_write = error_code & ESR_DST;
 #else
@@ -249,20 +250,18 @@ good_area:
 	 * the fault.
 	 */
  survive:
-        switch (handle_mm_fault(mm, vma, address, is_write)) {
-        case VM_FAULT_MINOR:
-                current->min_flt++;
-                break;
-        case VM_FAULT_MAJOR:
-                current->maj_flt++;
-                break;
-        case VM_FAULT_SIGBUS:
-                goto do_sigbus;
-        case VM_FAULT_OOM:
-                goto out_of_memory;
-	default:
+	fault = handle_mm_fault(mm, vma, address, is_write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	/*
diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c
index 63181671e3e3..60604b2819b2 100644
--- a/arch/s390/lib/uaccess_pt.c
+++ b/arch/s390/lib/uaccess_pt.c
@@ -20,6 +20,7 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address,
 {
 	struct vm_area_struct *vma;
 	int ret = -EFAULT;
+	int fault;
 
 	if (in_atomic())
 		return ret;
@@ -44,20 +45,18 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address,
 	}
 
 survive:
-	switch (handle_mm_fault(mm, vma, address, write_access)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto out_sigbus;
-	case VM_FAULT_OOM:
-		goto out_of_memory;
-	default:
+	fault = handle_mm_fault(mm, vma, address, write_access);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto out_sigbus;
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 	ret = 0;
 out:
 	up_read(&mm->mmap_sem);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index d855cdbf8fb8..54055194e9af 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -307,6 +307,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write)
 	unsigned long address;
 	int space;
 	int si_code;
+	int fault;
 
 	if (notify_page_fault(regs, error_code))
 		return;
@@ -377,23 +378,22 @@ survive:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-	case VM_FAULT_MINOR:
-		tsk->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		tsk->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		do_sigbus(regs, error_code, address);
-		return;
-	case VM_FAULT_OOM:
-		if (do_out_of_memory(regs, error_code, address))
-			goto survive;
-		return;
-	default:
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM) {
+			if (do_out_of_memory(regs, error_code, address))
+				goto survive;
+			return;
+		} else if (fault & VM_FAULT_SIGBUS) {
+			do_sigbus(regs, error_code, address);
+			return;
+		}
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 
         up_read(&mm->mmap_sem);
 	/*
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 0b3eaf6fbb28..964c6767dc73 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -33,6 +33,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
 	int si_code;
+	int fault;
 	siginfo_t info;
 
 	trace_hardirqs_on();
@@ -124,20 +125,18 @@ good_area:
 	 * the fault.
 	 */
 survive:
-	switch (handle_mm_fault(mm, vma, address, writeaccess)) {
-		case VM_FAULT_MINOR:
-			tsk->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			tsk->maj_flt++;
-			break;
-		case VM_FAULT_SIGBUS:
-			goto do_sigbus;
-		case VM_FAULT_OOM:
+	fault = handle_mm_fault(mm, vma, address, writeaccess);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
-		default:
-			BUG();
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/arch/sh64/mm/fault.c b/arch/sh64/mm/fault.c
index 3cd93ba5d826..0d069d82141f 100644
--- a/arch/sh64/mm/fault.c
+++ b/arch/sh64/mm/fault.c
@@ -127,6 +127,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	struct vm_area_struct * vma;
 	const struct exception_table_entry *fixup;
 	pte_t *pte;
+	int fault;
 
 #if defined(CONFIG_SH64_PROC_TLB)
         ++calls_to_do_slow_page_fault;
@@ -221,18 +222,19 @@ good_area:
 	 * the fault.
 	 */
 survive:
-	switch (handle_mm_fault(mm, vma, address, writeaccess)) {
-	case VM_FAULT_MINOR:
-		tsk->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		tsk->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	default:
-		goto out_of_memory;
+	fault = handle_mm_fault(mm, vma, address, writeaccess);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
+
 	/* If we get here, the page fault has been handled.  Do the TLB refill
 	   now from the newly-setup PTE, to avoid having to fault again right
 	   away on the same instruction. */
diff --git a/arch/sparc/mm/fault.c b/arch/sparc/mm/fault.c
index c3483365db4b..50747fe44356 100644
--- a/arch/sparc/mm/fault.c
+++ b/arch/sparc/mm/fault.c
@@ -226,6 +226,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
 	unsigned long g2;
 	siginfo_t info;
 	int from_user = !(regs->psr & PSR_PS);
+	int fault;
 
 	if(text_fault)
 		address = regs->pc;
@@ -289,19 +290,18 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	case VM_FAULT_OOM:
-		goto out_of_memory;
-	case VM_FAULT_MAJOR:
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
+	}
+	if (fault & VM_FAULT_MAJOR)
 		current->maj_flt++;
-		break;
-	case VM_FAULT_MINOR:
-	default:
+	else
 		current->min_flt++;
-		break;
-	}
 	up_read(&mm->mmap_sem);
 	return;
 
diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c
index b582024d2199..17123e9ecf78 100644
--- a/arch/sparc64/mm/fault.c
+++ b/arch/sparc64/mm/fault.c
@@ -278,7 +278,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned int insn = 0;
-	int si_code, fault_code;
+	int si_code, fault_code, fault;
 	unsigned long address, mm_rss;
 
 	fault_code = get_thread_fault_code();
@@ -415,20 +415,18 @@ good_area:
 			goto bad_area;
 	}
 
-	switch (handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE))) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	case VM_FAULT_OOM:
-		goto out_of_memory;
-	default:
+	fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE));
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 
 	up_read(&mm->mmap_sem);
 
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index abab90c3803f..3850d53f79fd 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -76,23 +76,24 @@ good_area:
 		goto out;
 
 	do {
+		int fault;
 survive:
-		switch (handle_mm_fault(mm, vma, address, is_write)){
-		case VM_FAULT_MINOR:
-			current->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			current->maj_flt++;
-			break;
-		case VM_FAULT_SIGBUS:
-			err = -EACCES;
-			goto out;
-		case VM_FAULT_OOM:
-			err = -ENOMEM;
-			goto out_of_memory;
-		default:
+		fault = handle_mm_fault(mm, vma, address, is_write);
+		if (unlikely(fault & VM_FAULT_ERROR)) {
+			if (fault & VM_FAULT_OOM) {
+				err = -ENOMEM;
+				goto out_of_memory;
+			} else if (fault & VM_FAULT_SIGBUS) {
+				err = -EACCES;
+				goto out;
+			}
 			BUG();
 		}
+		if (fault & VM_FAULT_MAJOR)
+			current->maj_flt++;
+		else
+			current->min_flt++;
+
 		pgd = pgd_offset(mm, address);
 		pud = pud_offset(pgd, address);
 		pmd = pmd_offset(pud, address);
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 635e58d443d7..84f11728fc76 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -317,7 +317,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	struct vm_area_struct * vma;
 	unsigned long address;
 	const struct exception_table_entry *fixup;
-	int write;
+	int write, fault;
 	unsigned long flags;
 	siginfo_t info;
 
@@ -450,19 +450,18 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-	case VM_FAULT_MINOR:
-		tsk->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		tsk->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	default:
-		goto out_of_memory;
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
-
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 	up_read(&mm->mmap_sem);
 	return;
 
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 3dc6f2f07bbe..16004067add3 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -41,6 +41,7 @@ void do_page_fault(struct pt_regs *regs)
 	siginfo_t info;
 
 	int is_write, is_exec;
+	int fault;
 
 	info.si_code = SEGV_MAPERR;
 
@@ -102,20 +103,18 @@ good_area:
 	 * the fault.
 	 */
 survive:
-	switch (handle_mm_fault(mm, vma, address, is_write)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	case VM_FAULT_OOM:
-		goto out_of_memory;
-	default:
+	fault = handle_mm_fault(mm, vma, address, is_write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
 		BUG();
 	}
+	if (fault & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
 
 	up_read(&mm->mmap_sem);
 	return;
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index dc287d2e3a66..927d739d4685 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -112,7 +112,7 @@ static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
 	struct gfs2_holder i_gh;
 	int alloc_required;
 	int error;
-	int ret = VM_FAULT_MINOR;
+	int ret = 0;
 
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
 	if (error)
@@ -132,14 +132,19 @@ static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
 	set_bit(GFF_EXLOCK, &gf->f_flags);
 	ret = filemap_fault(vma, vmf);
 	clear_bit(GFF_EXLOCK, &gf->f_flags);
-	if (ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE))
+	if (ret & VM_FAULT_ERROR)
 		goto out_unlock;
 
 	if (alloc_required) {
 		/* XXX: do we need to drop page lock around alloc_page_backing?*/
 		error = alloc_page_backing(ip, vmf->page);
 		if (error) {
-			if (ret & FAULT_RET_LOCKED)
+			/*
+			 * VM_FAULT_LOCKED should always be the case for
+			 * filemap_fault, but it may not be in a future
+			 * implementation.
+			 */
+			if (ret & VM_FAULT_LOCKED)
 				unlock_page(vmf->page);
 			page_cache_release(vmf->page);
 			ret = VM_FAULT_OOM;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ff0b8844bd5a..f8e12b3b6110 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -196,25 +196,10 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 
 
-#define FAULT_RET_NOPAGE	0x0100	/* ->fault did not return a page. This
-					 * can be used if the handler installs
-					 * their own pte.
-					 */
-#define FAULT_RET_LOCKED	0x0200	/* ->fault locked the page, caller must
-					 * unlock after installing the mapping.
-					 * This is used by pagecache in
-					 * particular, where the page lock is
-					 * used to synchronise against truncate
-					 * and invalidate. Mutually exclusive
-					 * with FAULT_RET_NOPAGE.
-					 */
-
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
- * ->fault function. The vma's ->fault is responsible for returning the
- * VM_FAULT_xxx type which occupies the lowest byte of the return code, ORed
- * with FAULT_RET_ flags that occupy the next byte and give details about
- * how the fault was handled.
+ * ->fault function. The vma's ->fault is responsible for returning a bitmask
+ * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
  * pgoff should be used in favour of virtual_address, if possible. If pgoff
  * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
@@ -226,9 +211,9 @@ struct vm_fault {
 	void __user *virtual_address;	/* Faulting virtual address */
 
 	struct page *page;		/* ->fault handlers should return a
-					 * page here, unless FAULT_RET_NOPAGE
+					 * page here, unless VM_FAULT_NOPAGE
 					 * is set (which is also implied by
-					 * VM_FAULT_OOM or SIGBUS).
+					 * VM_FAULT_ERROR).
 					 */
 };
 
@@ -712,26 +697,17 @@ static inline int page_mapped(struct page *page)
  * just gets major/minor fault counters bumped up.
  */
 
-/*
- * VM_FAULT_ERROR is set for the error cases, to make some tests simpler.
- */
-#define VM_FAULT_ERROR	0x20
+#define VM_FAULT_MINOR	0 /* For backwards compat. Remove me quickly. */
 
-#define VM_FAULT_OOM	(0x00 | VM_FAULT_ERROR)
-#define VM_FAULT_SIGBUS	(0x01 | VM_FAULT_ERROR)
-#define VM_FAULT_MINOR	0x02
-#define VM_FAULT_MAJOR	0x03
+#define VM_FAULT_OOM	0x0001
+#define VM_FAULT_SIGBUS	0x0002
+#define VM_FAULT_MAJOR	0x0004
+#define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
 
-/* 
- * Special case for get_user_pages.
- * Must be in a distinct bit from the above VM_FAULT_ flags.
- */
-#define VM_FAULT_WRITE	0x10
+#define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
+#define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 
-/*
- * Mask of VM_FAULT_ flags
- */
-#define VM_FAULT_MASK	0xff
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 
@@ -817,16 +793,8 @@ extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 
 #ifdef CONFIG_MMU
-extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
+extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
-
-static inline int handle_mm_fault(struct mm_struct *mm,
-			struct vm_area_struct *vma, unsigned long address,
-			int write_access)
-{
-	return __handle_mm_fault(mm, vma, address, write_access) &
-				(~VM_FAULT_WRITE);
-}
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
diff --git a/kernel/futex.c b/kernel/futex.c
index 5c3f45d07c53..a12425051ee9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -346,15 +346,20 @@ static int futex_handle_fault(unsigned long address,
 	vma = find_vma(mm, address);
 	if (vma && address >= vma->vm_start &&
 	    (vma->vm_flags & VM_WRITE)) {
-		switch (handle_mm_fault(mm, vma, address, 1)) {
-		case VM_FAULT_MINOR:
-			ret = 0;
-			current->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
+		int fault;
+		fault = handle_mm_fault(mm, vma, address, 1);
+		if (unlikely((fault & VM_FAULT_ERROR))) {
+#if 0
+			/* XXX: let's do this when we verify it is OK */
+			if (ret & VM_FAULT_OOM)
+				ret = -ENOMEM;
+#endif
+		} else {
 			ret = 0;
-			current->maj_flt++;
-			break;
+			if (fault & VM_FAULT_MAJOR)
+				current->maj_flt++;
+			else
+				current->min_flt++;
 		}
 	}
 	if (!fshared)
diff --git a/mm/filemap.c b/mm/filemap.c
index 0876cc57255f..4fd9e3f0f48a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1322,9 +1322,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page;
 	unsigned long size;
 	int did_readaround = 0;
-	int ret;
-
-	ret = VM_FAULT_MINOR;
+	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -1408,7 +1406,7 @@ retry_find:
 	 */
 	mark_page_accessed(page);
 	vmf->page = page;
-	return ret | FAULT_RET_LOCKED;
+	return ret | VM_FAULT_LOCKED;
 
 outside_data_content:
 	/*
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 847d5d78163e..53ee6a299635 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -252,7 +252,7 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 out:
 	page_cache_get(page);
 	vmf->page = page;
-	return VM_FAULT_MINOR;
+	return 0;
 }
 
 static struct vm_operations_struct xip_file_vm_ops = {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aaa7c1a682d9..c4a573b857bd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -469,7 +469,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	avoidcopy = (page_count(old_page) == 1);
 	if (avoidcopy) {
 		set_huge_ptep_writable(vma, address, ptep);
-		return VM_FAULT_MINOR;
+		return 0;
 	}
 
 	page_cache_get(old_page);
@@ -494,7 +494,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	page_cache_release(new_page);
 	page_cache_release(old_page);
-	return VM_FAULT_MINOR;
+	return 0;
 }
 
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -551,7 +551,7 @@ retry:
 	if (idx >= size)
 		goto backout;
 
-	ret = VM_FAULT_MINOR;
+	ret = 0;
 	if (!pte_none(*ptep))
 		goto backout;
 
@@ -602,7 +602,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return ret;
 	}
 
-	ret = VM_FAULT_MINOR;
+	ret = 0;
 
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
@@ -641,7 +641,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			spin_unlock(&mm->page_table_lock);
 			ret = hugetlb_fault(mm, vma, vaddr, 0);
 			spin_lock(&mm->page_table_lock);
-			if (ret == VM_FAULT_MINOR)
+			if (!(ret & VM_FAULT_MAJOR))
 				continue;
 
 			remainder = 0;
diff --git a/mm/memory.c b/mm/memory.c
index 23c870479b3e..61d51da7e17c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1068,31 +1068,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
-				ret = __handle_mm_fault(mm, vma, start,
+				ret = handle_mm_fault(mm, vma, start,
 						foll_flags & FOLL_WRITE);
+				if (ret & VM_FAULT_ERROR) {
+					if (ret & VM_FAULT_OOM)
+						return i ? i : -ENOMEM;
+					else if (ret & VM_FAULT_SIGBUS)
+						return i ? i : -EFAULT;
+					BUG();
+				}
+				if (ret & VM_FAULT_MAJOR)
+					tsk->maj_flt++;
+				else
+					tsk->min_flt++;
+
 				/*
-				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
-				 * broken COW when necessary, even if maybe_mkwrite
-				 * decided not to set pte_write. We can thus safely do
-				 * subsequent page lookups as if they were reads.
+				 * The VM_FAULT_WRITE bit tells us that
+				 * do_wp_page has broken COW when necessary,
+				 * even if maybe_mkwrite decided not to set
+				 * pte_write. We can thus safely do subsequent
+				 * page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
 					foll_flags &= ~FOLL_WRITE;
-				
-				switch (ret & ~VM_FAULT_WRITE) {
-				case VM_FAULT_MINOR:
-					tsk->min_flt++;
-					break;
-				case VM_FAULT_MAJOR:
-					tsk->maj_flt++;
-					break;
-				case VM_FAULT_SIGBUS:
-					return i ? i : -EFAULT;
-				case VM_FAULT_OOM:
-					return i ? i : -ENOMEM;
-				default:
-					BUG();
-				}
+
 				cond_resched();
 			}
 			if (pages) {
@@ -1639,7 +1638,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse = 0, ret = VM_FAULT_MINOR;
+	int reuse = 0, ret = 0;
 	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
@@ -1835,8 +1834,8 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
 	/*
 	 * files that support invalidating or truncating portions of the
 	 * file from under mmaped areas must have their ->fault function
-	 * return a locked page (and FAULT_RET_LOCKED code). This provides
-	 * synchronisation against concurrent unmapping here.
+	 * return a locked page (and set VM_FAULT_LOCKED in the return).
+	 * This provides synchronisation against concurrent unmapping here.
 	 */
 
 again:
@@ -2140,7 +2139,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	swp_entry_t entry;
 	pte_t pte;
-	int ret = VM_FAULT_MINOR;
+	int ret = 0;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
@@ -2208,8 +2207,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unlock_page(page);
 
 	if (write_access) {
+		/* XXX: We could OR the do_wp_page code with this one? */
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
 			ret = VM_FAULT_OOM;
 		goto out;
 	}
@@ -2280,7 +2280,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	return VM_FAULT_MINOR;
+	return 0;
 release:
 	page_cache_release(page);
 	goto unlock;
@@ -2323,11 +2323,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (likely(vma->vm_ops->fault)) {
 		ret = vma->vm_ops->fault(vma, &vmf);
-		if (unlikely(ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE)))
-			return (ret & VM_FAULT_MASK);
+		if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+			return ret;
 	} else {
 		/* Legacy ->nopage path */
-		ret = VM_FAULT_MINOR;
+		ret = 0;
 		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 		/* no page was available -- either SIGBUS or OOM */
 		if (unlikely(vmf.page == NOPAGE_SIGBUS))
@@ -2340,7 +2340,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * For consistency in subsequent calls, make the faulted page always
 	 * locked.
 	 */
-	if (unlikely(!(ret & FAULT_RET_LOCKED)))
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
 		lock_page(vmf.page);
 	else
 		VM_BUG_ON(!PageLocked(vmf.page));
@@ -2356,7 +2356,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = VM_FAULT_OOM;
 				goto out;
 			}
-			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+						vma, address);
 			if (!page) {
 				ret = VM_FAULT_OOM;
 				goto out;
@@ -2384,7 +2385,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				 * is better done later.
 				 */
 				if (!page->mapping) {
-					ret = VM_FAULT_MINOR;
+					ret = 0;
 					anon = 1; /* no anon but release vmf.page */
 					goto out;
 				}
@@ -2447,7 +2448,7 @@ out_unlocked:
 		put_page(dirty_page);
 	}
 
-	return (ret & VM_FAULT_MASK);
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -2486,7 +2487,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 	unsigned long pfn;
-	int ret = VM_FAULT_MINOR;
 
 	pte_unmap(page_table);
 	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
@@ -2498,7 +2498,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 	else if (unlikely(pfn == NOPFN_SIGBUS))
 		return VM_FAULT_SIGBUS;
 	else if (unlikely(pfn == NOPFN_REFAULT))
-		return VM_FAULT_MINOR;
+		return 0;
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
@@ -2510,7 +2510,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 		set_pte_at(mm, address, page_table, entry);
 	}
 	pte_unmap_unlock(page_table, ptl);
-	return ret;
+	return 0;
 }
 
 /*
@@ -2531,7 +2531,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff_t pgoff;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return VM_FAULT_MINOR;
+		return 0;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
 			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
@@ -2615,13 +2615,13 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
-	return VM_FAULT_MINOR;
+	return 0;
 }
 
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
@@ -2650,7 +2650,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
-EXPORT_SYMBOL_GPL(__handle_mm_fault);
+EXPORT_SYMBOL_GPL(handle_mm_fault);
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 0a555af8733d..ad155c7745dc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1103,7 +1103,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 		return -EFBIG;
 
 	if (type)
-		*type = VM_FAULT_MINOR;
+		*type = 0;
 
 	/*
 	 * Normally, filepage is NULL on entry, and either found
@@ -1138,9 +1138,9 @@ repeat:
 		if (!swappage) {
 			shmem_swp_unmap(entry);
 			/* here we actually do the io */
-			if (type && *type == VM_FAULT_MINOR) {
+			if (type && !(*type & VM_FAULT_MAJOR)) {
 				__count_vm_event(PGMAJFAULT);
-				*type = VM_FAULT_MAJOR;
+				*type |= VM_FAULT_MAJOR;
 			}
 			spin_unlock(&info->lock);
 			swappage = shmem_swapin(info, swap, idx);
@@ -1323,7 +1323,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
 
 	mark_page_accessed(vmf->page);
-	return ret | FAULT_RET_LOCKED;
+	return ret | VM_FAULT_LOCKED;
 }
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.3-8-gc7d7


From bb2d5ce16409efcdf94017a6b6fecd468226e29c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 19 Jul 2007 01:47:23 -0700
Subject: Remove alloc_zeroed_user_highpage()

alloc_zeroed_user_highpage() has no in-tree users and it is not exported.
As it is not exported, it can simply be removed.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 12c5e4e3135a..1fcb0033179e 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -102,21 +102,6 @@ __alloc_zeroed_user_highpage(gfp_t movableflags,
 }
 #endif
 
-/**
- * alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA
- * @vma: The VMA the page is to be allocated for
- * @vaddr: The virtual address the page will be inserted into
- *
- * This function will allocate a page for a VMA that the caller knows will
- * not be able to move in the future using move_pages() or reclaim. If it
- * is known that the page can move, use alloc_zeroed_user_highpage_movable
- */
-static inline struct page *
-alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
-{
-	return __alloc_zeroed_user_highpage(0, vma, vaddr);
-}
-
 /**
  * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
  * @vma: The VMA the page is to be allocated for
-- 
cgit v1.3-8-gc7d7


From a634cc10164d1c229fbeca33923e6a0ed939e894 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 19 Jul 2007 01:47:30 -0700
Subject: swsusp: introduce restore platform operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At least on some machines it is necessary to prepare the ACPI firmware for the
restoration of the system memory state from the hibernation image if the
"platform" mode of hibernation has been used.  Namely, in that cases we need
to disable the GPEs before replacing the "boot" kernel with the "frozen"
kernel (cf.  http://bugzilla.kernel.org/show_bug.cgi?id=7887).  After the
restore they will be re-enabled by hibernation_ops->finish(), but if the
restore fails, they have to be re-enabled by the restore code explicitly.

For this purpose we can introduce two additional hibernation operations,
called pre_restore() and restore_cleanup() and call them from the restore code
path.  Still, they should be called if the "platform" mode of hibernation has
been used, so we need to pass the information about the hibernation mode from
the "frozen" kernel to the "boot" kernel in the image header.

Apparently, we can't drop the disabling of GPEs before the restore because of
Bug #7887 .   We also can't do it unconditionally, because the GPEs wouldn't
have been enabled after a successful restore if the suspend had been done in
the 'shutdown' or 'reboot' mode.

In principle we could (and probably should) unconditionally disable the GPEs
before each snapshot creation *and* before the restore, but then we'd have to
unconditionally enable them after the snapshot creation as well as after the
restore (or restore failure)   Still, for this purpose we'd need to modify
acpi_enter_sleep_state_prep() and acpi_leave_sleep_state() and we'd have to
introduce some mechanism synchronizing the disablind/enabling of the GPEs with
the device drivers' .suspend()/.resume() routines and with
disable_/enable_nonboot_cpus().   However, this would have affected the
suspend (ie.  s2ram) code as well as the hibernation, which I'd like to avoid
in this patch series.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/sleep/main.c | 16 ++++++++++++++
 include/linux/suspend.h   |  4 ++++
 kernel/power/disk.c       | 56 ++++++++++++++++++++++++++++++++++++++---------
 kernel/power/power.h      | 13 ++++++++---
 kernel/power/swap.c       | 20 ++++++++++++-----
 kernel/power/user.c       |  2 +-
 6 files changed, 92 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c
index bc7e16ec8393..42127c0d612c 100644
--- a/drivers/acpi/sleep/main.c
+++ b/drivers/acpi/sleep/main.c
@@ -217,10 +217,26 @@ static void acpi_hibernation_finish(void)
 	}
 }
 
+static int acpi_hibernation_pre_restore(void)
+{
+	acpi_status status;
+
+	status = acpi_hw_disable_all_gpes();
+
+	return ACPI_SUCCESS(status) ? 0 : -EFAULT;
+}
+
+static void acpi_hibernation_restore_cleanup(void)
+{
+	acpi_hw_enable_all_runtime_gpes();
+}
+
 static struct hibernation_ops acpi_hibernation_ops = {
 	.prepare = acpi_hibernation_prepare,
 	.enter = acpi_hibernation_enter,
 	.finish = acpi_hibernation_finish,
+	.pre_restore = acpi_hibernation_pre_restore,
+	.restore_cleanup = acpi_hibernation_restore_cleanup,
 };
 #endif				/* CONFIG_SOFTWARE_SUSPEND */
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 9c7cb6430666..d235c146da2b 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -43,11 +43,15 @@ static inline void pm_restore_console(void) {}
  * @prepare: prepare system for hibernation
  * @enter: shut down system after state has been saved to disk
  * @finish: finish/clean up after state has been reloaded
+ * @pre_restore: prepare system for the restoration from a hibernation image
+ * @restore_cleanup: clean up after a failing image restoration
  */
 struct hibernation_ops {
 	int (*prepare)(void);
 	int (*enter)(void);
 	void (*finish)(void);
+	int (*pre_restore)(void);
+	void (*restore_cleanup)(void);
 };
 
 #if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 47882bfa610e..fa3b43b7206d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -54,7 +54,8 @@ static struct hibernation_ops *hibernation_ops;
 
 void hibernation_set_ops(struct hibernation_ops *ops)
 {
-	if (ops && !(ops->prepare && ops->enter && ops->finish)) {
+	if (ops && !(ops->prepare && ops->enter && ops->finish
+	    && ops->pre_restore && ops->restore_cleanup)) {
 		WARN_ON(1);
 		return;
 	}
@@ -91,6 +92,31 @@ static void platform_finish(int platform_mode)
 		hibernation_ops->finish();
 }
 
+/**
+ *	platform_pre_restore - prepare the platform for the restoration from a
+ *	hibernation image.  If the restore fails after this function has been
+ *	called, platform_restore_cleanup() must be called.
+ */
+
+static int platform_pre_restore(int platform_mode)
+{
+	return (platform_mode && hibernation_ops) ?
+		hibernation_ops->pre_restore() : 0;
+}
+
+/**
+ *	platform_restore_cleanup - switch the platform to the normal mode of
+ *	operation after a failing restore.  If platform_pre_restore() has been
+ *	called before the failing restore, this function must be called too,
+ *	regardless of the result of platform_pre_restore().
+ */
+
+static void platform_restore_cleanup(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->restore_cleanup();
+}
+
 /**
  *	hibernation_snapshot - quiesce devices and create the hibernation
  *	snapshot image.
@@ -141,11 +167,13 @@ int hibernation_snapshot(int platform_mode)
 /**
  *	hibernation_restore - quiesce devices and restore the hibernation
  *	snapshot image.  If successful, control returns in hibernation_snaphot()
+ *	@platform_mode - if set, use the platform driver, if available, to
+ *			 prepare the platform frimware for the transition.
  *
  *	Must be called with pm_mutex held
  */
 
-int hibernation_restore(void)
+int hibernation_restore(int platform_mode)
 {
 	int error;
 
@@ -155,11 +183,14 @@ int hibernation_restore(void)
 	if (error)
 		goto Finish;
 
-	error = disable_nonboot_cpus();
-	if (!error)
-		error = swsusp_resume();
-
-	enable_nonboot_cpus();
+	error = platform_pre_restore(platform_mode);
+	if (!error) {
+		error = disable_nonboot_cpus();
+		if (!error)
+			error = swsusp_resume();
+		enable_nonboot_cpus();
+	}
+	platform_restore_cleanup(platform_mode);
  Finish:
 	device_resume();
 	resume_console();
@@ -260,8 +291,12 @@ int hibernate(void)
 	}
 	error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
 	if (in_suspend && !error) {
+		unsigned int flags = 0;
+
+		if (hibernation_mode == HIBERNATION_PLATFORM)
+			flags |= SF_PLATFORM_MODE;
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write();
+		error = swsusp_write(flags);
 		swsusp_free();
 		if (!error)
 			power_down();
@@ -295,6 +330,7 @@ int hibernate(void)
 static int software_resume(void)
 {
 	int error;
+	unsigned int flags;
 
 	mutex_lock(&pm_mutex);
 	if (!swsusp_resume_device) {
@@ -342,9 +378,9 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	error = swsusp_read();
+	error = swsusp_read(&flags);
 	if (!error)
-		hibernation_restore();
+		hibernation_restore(flags & SF_PLATFORM_MODE);
 
 	printk(KERN_ERR "PM: Restore failed, recovering.\n");
 	swsusp_free();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 70c378b3f85a..eab3603b7caf 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -27,7 +27,7 @@ struct swsusp_info {
 
 /* kernel/power/disk.c */
 extern int hibernation_snapshot(int platform_mode);
-extern int hibernation_restore(void);
+extern int hibernation_restore(int platform_mode);
 extern int hibernation_platform_enter(void);
 #endif
 
@@ -155,13 +155,20 @@ extern sector_t alloc_swapdev_block(int swap);
 extern void free_all_swap_pages(int swap);
 extern int swsusp_swap_in_use(void);
 
+/*
+ * Flags that can be passed from the hibernatig hernel to the "boot" kernel in
+ * the image header.
+ */
+#define SF_PLATFORM_MODE	1
+
+/* kernel/power/disk.c */
 extern int swsusp_check(void);
 extern int swsusp_shrink_memory(void);
 extern void swsusp_free(void);
 extern int swsusp_suspend(void);
 extern int swsusp_resume(void);
-extern int swsusp_read(void);
-extern int swsusp_write(void);
+extern int swsusp_read(unsigned int *flags_p);
+extern int swsusp_write(unsigned int flags);
 extern void swsusp_close(void);
 extern int suspend_enter(suspend_state_t state);
 
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8b1a1b837145..917aba100575 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -33,8 +33,9 @@ extern char resume_file[];
 #define SWSUSP_SIG	"S1SUSPEND"
 
 struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
+	char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
 	sector_t image;
+	unsigned int flags;	/* Flags to pass to the "boot" kernel */
 	char	orig_sig[10];
 	char	sig[10];
 } __attribute__((packed));
@@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain)
  * Saving part
  */
 
-static int mark_swapfiles(sector_t start)
+static int mark_swapfiles(sector_t start, unsigned int flags)
 {
 	int error;
 
@@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start)
 		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
 		memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
 		swsusp_header->image = start;
+		swsusp_header->flags = flags;
 		error = bio_write_page(swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
@@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages)
 
 /**
  *	swsusp_write - Write entire image and metadata.
+ *	@flags: flags to pass to the "boot" kernel in the image header
  *
  *	It is important _NOT_ to umount filesystems at this point. We want
  *	them synced (in case something goes wrong) but we DO not want to mark
@@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages)
  *	correctly, we'll mark system clean, anyway.)
  */
 
-int swsusp_write(void)
+int swsusp_write(unsigned int flags)
 {
 	struct swap_map_handle handle;
 	struct snapshot_handle snapshot;
@@ -415,7 +418,7 @@ int swsusp_write(void)
 		if (!error) {
 			flush_swap_writer(&handle);
 			printk("S");
-			error = mark_swapfiles(start);
+			error = mark_swapfiles(start, flags);
 			printk("|\n");
 		}
 	}
@@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle,
 	return error;
 }
 
-int swsusp_read(void)
+/**
+ *	swsusp_read - read the hibernation image.
+ *	@flags_p: flags passed by the "frozen" kernel in the image header should
+ *		  be written into this memeory location
+ */
+
+int swsusp_read(unsigned int *flags_p)
 {
 	int error;
 	struct swap_map_handle handle;
 	struct snapshot_handle snapshot;
 	struct swsusp_info *header;
 
+	*flags_p = swsusp_header->flags;
 	if (IS_ERR(resume_bdev)) {
 		pr_debug("swsusp: block device not initialised\n");
 		return PTR_ERR(resume_bdev);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bfed3b924093..1f24f30b951b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -188,7 +188,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = -EPERM;
 			break;
 		}
-		error = hibernation_restore();
+		error = hibernation_restore(data->platform_suspend);
 		break;
 
 	case SNAPSHOT_FREE:
-- 
cgit v1.3-8-gc7d7


From 0c1eecfb345401629aa57c9d3b077273e56c45a7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 19 Jul 2007 01:47:33 -0700
Subject: Freezer: avoid freezing kernel threads prematurely

Kernel threads should not have TIF_FREEZE set when user space processes are
being frozen, since otherwise some of them might be frozen prematurely.
To prevent this from happening we can (1) make exit_mm() unset TIF_FREEZE
unconditionally just after clearing tsk->mm and (2) make try_to_freeze_tasks()
check if p->mm is different from zero and PF_BORROWED_MM is unset in p->flags
when user space processes are to be frozen.

Namely, when user space processes are being frozen, we only should set
TIF_FREEZE for tasks that have p->mm different from NULL and don't have
PF_BORROWED_MM set in p->flags.  For this reason task_lock() must be used to
prevent try_to_freeze_tasks() from racing with use_mm()/unuse_mm(), in which
p->mm and p->flags.PF_BORROWED_MM are changed under task_lock(p).  Also, we
need to prevent the following scenario from happening:

* daemonize() is called by a task spawned from a user space code path
* freezer checks if the task has p->mm set and the result is positive
* task enters exit_mm() and clears its TIF_FREEZE
* freezer sets TIF_FREEZE for the task
* task calls try_to_freeze() and goes to the refrigerator, which is wrong at
  that point

This requires us to acquire task_lock(p) before p->flags.PF_BORROWED_MM and
p->mm are examined and release it after TIF_FREEZE is set for p (or it turns
out that TIF_FREEZE should not be set).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Gautham R Shenoy <ego@in.ibm.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/freezer.h |  9 +++----
 kernel/exit.c           |  3 +++
 kernel/power/process.c  | 64 +++++++++++++++++++++++++------------------------
 3 files changed, 41 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 2d38b1a74662..c8e02de737f6 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -25,7 +25,7 @@ static inline int freezing(struct task_struct *p)
 /*
  * Request that a process be frozen
  */
-static inline void freeze(struct task_struct *p)
+static inline void set_freeze_flag(struct task_struct *p)
 {
 	set_tsk_thread_flag(p, TIF_FREEZE);
 }
@@ -33,7 +33,7 @@ static inline void freeze(struct task_struct *p)
 /*
  * Sometimes we may need to cancel the previous 'freeze' request
  */
-static inline void do_not_freeze(struct task_struct *p)
+static inline void clear_freeze_flag(struct task_struct *p)
 {
 	clear_tsk_thread_flag(p, TIF_FREEZE);
 }
@@ -56,7 +56,7 @@ static inline int thaw_process(struct task_struct *p)
 		wake_up_process(p);
 		return 1;
 	}
-	clear_tsk_thread_flag(p, TIF_FREEZE);
+	clear_freeze_flag(p);
 	task_unlock(p);
 	return 0;
 }
@@ -129,7 +129,8 @@ static inline void set_freezable(void)
 #else
 static inline int frozen(struct task_struct *p) { return 0; }
 static inline int freezing(struct task_struct *p) { return 0; }
-static inline void freeze(struct task_struct *p) { BUG(); }
+static inline void set_freeze_flag(struct task_struct *p) {}
+static inline void clear_freeze_flag(struct task_struct *p) {}
 static inline int thaw_process(struct task_struct *p) { return 1; }
 
 static inline void refrigerator(void) {}
diff --git a/kernel/exit.c b/kernel/exit.c
index e8af8d0c2483..464c2b172f07 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -45,6 +45,7 @@
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -594,6 +595,8 @@ static void exit_mm(struct task_struct * tsk)
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
 	enter_lazy_tlb(mm, current);
+	/* We don't want this task to be frozen prematurely */
+	clear_freeze_flag(tsk);
 	task_unlock(tsk);
 	mmput(mm);
 }
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b850173e7561..e1bcdedd1464 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,7 +40,7 @@ static inline void frozen_process(void)
 		current->flags |= PF_FROZEN;
 		wmb();
 	}
-	clear_tsk_thread_flag(current, TIF_FREEZE);
+	clear_freeze_flag(current);
 }
 
 /* Refrigerator is place where frozen processes are stored :-). */
@@ -75,17 +75,16 @@ void refrigerator(void)
 	current->state = save;
 }
 
-static inline void freeze_process(struct task_struct *p)
+static void freeze_task(struct task_struct *p)
 {
 	unsigned long flags;
 
 	if (!freezing(p)) {
 		rmb();
 		if (!frozen(p)) {
+			set_freeze_flag(p);
 			if (p->state == TASK_STOPPED)
 				force_sig_specific(SIGSTOP, p);
-
-			freeze(p);
 			spin_lock_irqsave(&p->sighand->siglock, flags);
 			signal_wake_up(p, p->state == TASK_STOPPED);
 			spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -99,18 +98,13 @@ static void cancel_freezing(struct task_struct *p)
 
 	if (freezing(p)) {
 		pr_debug("  clean up: %s\n", p->comm);
-		do_not_freeze(p);
+		clear_freeze_flag(p);
 		spin_lock_irqsave(&p->sighand->siglock, flags);
 		recalc_sigpending_and_wake(p);
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
 }
 
-static inline int is_user_space(struct task_struct *p)
-{
-	return p->mm && !(p->flags & PF_BORROWED_MM);
-}
-
 static unsigned int try_to_freeze_tasks(int freeze_user_space)
 {
 	struct task_struct *g, *p;
@@ -122,20 +116,34 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
 		todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			if (!freezeable(p))
-				continue;
-
-			if (frozen(p))
+			if (frozen(p) || !freezeable(p))
 				continue;
 
-			if (p->state == TASK_TRACED && frozen(p->parent)) {
-				cancel_freezing(p);
-				continue;
+			if (freeze_user_space) {
+				if (p->state == TASK_TRACED &&
+				    frozen(p->parent)) {
+					cancel_freezing(p);
+					continue;
+				}
+				/*
+				 * Kernel threads should not have TIF_FREEZE set
+				 * at this point, so we must ensure that either
+				 * p->mm is not NULL *and* PF_BORROWED_MM is
+				 * unset, or TIF_FRREZE is left unset.
+				 * The task_lock() is necessary to prevent races
+				 * with exit_mm() or use_mm()/unuse_mm() from
+				 * occuring.
+				 */
+				task_lock(p);
+				if (!p->mm || (p->flags & PF_BORROWED_MM)) {
+					task_unlock(p);
+					continue;
+				}
+				freeze_task(p);
+				task_unlock(p);
+			} else {
+				freeze_task(p);
 			}
-			if (freeze_user_space && !is_user_space(p))
-				continue;
-
-			freeze_process(p);
 			if (!freezer_should_skip(p))
 				todo++;
 		} while_each_thread(g, p);
@@ -152,22 +160,16 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
 		 * but it cleans up leftover PF_FREEZE requests.
 		 */
 		printk("\n");
-		printk(KERN_ERR "Stopping %s timed out after %d seconds "
+		printk(KERN_ERR "Freezing of %s timed out after %d seconds "
 				"(%d tasks refusing to freeze):\n",
-				freeze_user_space ? "user space processes" :
-					"kernel threads",
+				freeze_user_space ? "user space " : "tasks ",
 				TIMEOUT / HZ, todo);
 		show_state();
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			if (freeze_user_space && !is_user_space(p))
-				continue;
-
 			task_lock(p);
-			if (freezeable(p) && !frozen(p) &&
-			    !freezer_should_skip(p))
+			if (freezing(p) && !freezer_should_skip(p))
 				printk(KERN_ERR " %s\n", p->comm);
-
 			cancel_freezing(p);
 			task_unlock(p);
 		} while_each_thread(g, p);
@@ -211,7 +213,7 @@ static void thaw_tasks(int thaw_user_space)
 		if (!freezeable(p))
 			continue;
 
-		if (is_user_space(p) == !thaw_user_space)
+		if (!p->mm == thaw_user_space)
 			continue;
 
 		thaw_process(p);
-- 
cgit v1.3-8-gc7d7


From b10d911749d37dccfa5873d2088aea3f074b9e45 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 19 Jul 2007 01:47:36 -0700
Subject: PM: introduce hibernation and suspend notifiers

Make it possible to register hibernation and suspend notifiers, so that
subsystems can perform hibernation-related or suspend-related operations that
should not be carried out by device drivers' .suspend() and .resume()
routines.

[akpm@linux-foundation.org: build fixes]
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/power/notifiers.txt | 50 +++++++++++++++++++++++++++++++++++++++
 include/linux/notifier.h          |  6 +++++
 include/linux/suspend.h           | 48 +++++++++++++++++++++++++++++++++----
 kernel/power/disk.c               | 16 +++++++++----
 kernel/power/main.c               |  9 +++++++
 kernel/power/power.h              | 10 ++++++++
 kernel/power/user.c               | 11 ++++++---
 7 files changed, 138 insertions(+), 12 deletions(-)
 create mode 100644 Documentation/power/notifiers.txt

(limited to 'include/linux')

diff --git a/Documentation/power/notifiers.txt b/Documentation/power/notifiers.txt
new file mode 100644
index 000000000000..9293e4bc857c
--- /dev/null
+++ b/Documentation/power/notifiers.txt
@@ -0,0 +1,50 @@
+Suspend notifiers
+	(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+There are some operations that device drivers may want to carry out in their
+.suspend() routines, but shouldn't, because they can cause the hibernation or
+suspend to fail. For example, a driver may want to allocate a substantial amount
+of memory (like 50 MB) in .suspend(), but that shouldn't be done after the
+swsusp's memory shrinker has run.
+
+Also, there may be some operations, that subsystems want to carry out before a
+hibernation/suspend or after a restore/resume, requiring the system to be fully
+functional, so the drivers' .suspend() and .resume() routines are not suitable
+for this purpose.  For example, device drivers may want to upload firmware to
+their devices after a restore from a hibernation image, but they cannot do it by
+calling request_firmware() from their .resume() routines (user land processes
+are frozen at this point).  The solution may be to load the firmware into
+memory before processes are frozen and upload it from there in the .resume()
+routine.  Of course, a hibernation notifier may be used for this purpose.
+
+The subsystems that have such needs can register suspend notifiers that will be
+called upon the following events by the suspend core:
+
+PM_HIBERNATION_PREPARE	The system is going to hibernate or suspend, tasks will
+			be frozen immediately.
+
+PM_POST_HIBERNATION	The system memory state has been restored from a
+			hibernation image or an error occured during the
+			hibernation.  Device drivers' .resume() callbacks have
+			been executed and tasks have been thawed.
+
+PM_SUSPEND_PREPARE	The system is preparing for a suspend.
+
+PM_POST_SUSPEND		The system has just resumed or an error occured during
+			the suspend.	Device drivers' .resume() callbacks have
+			been executed and tasks have been thawed.
+
+It is generally assumed that whatever the notifiers do for
+PM_HIBERNATION_PREPARE, should be undone for PM_POST_HIBERNATION.  Analogously,
+operations performed for PM_SUSPEND_PREPARE should be reversed for
+PM_POST_SUSPEND.  Additionally, all of the notifiers are called for
+PM_POST_HIBERNATION if one of them fails for PM_HIBERNATION_PREPARE, and
+all of the notifiers are called for PM_POST_SUSPEND if one of them fails for
+PM_SUSPEND_PREPARE.
+
+The hibernation and suspend notifiers are called with pm_mutex held.  They are
+defined in the usual way, but their last argument is meaningless (it is always
+NULL).  To register and/or unregister a suspend notifier use the functions
+register_pm_notifier() and unregister_pm_notifier(), respectively, defined in
+include/linux/suspend.h .  If you don't need to unregister the notifier, you can
+also use the pm_notifier() macro defined in include/linux/suspend.h .
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 576f2bb34cc8..be3f2bb6fcf3 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -212,5 +212,11 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 #define CPU_DEAD_FROZEN		(CPU_DEAD | CPU_TASKS_FROZEN)
 #define CPU_DYING_FROZEN	(CPU_DYING | CPU_TASKS_FROZEN)
 
+/* Hibernation and suspend events */
+#define PM_HIBERNATION_PREPARE	0x0001 /* Going to hibernate */
+#define PM_POST_HIBERNATION	0x0002 /* Hibernation finished */
+#define PM_SUSPEND_PREPARE	0x0003 /* Going to suspend the system */
+#define PM_POST_SUSPEND		0x0004 /* Suspend finished */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_NOTIFIER_H */
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d235c146da2b..e8e6da394c92 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -54,7 +54,8 @@ struct hibernation_ops {
 	void (*restore_cleanup)(void);
 };
 
-#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND)
+#ifdef CONFIG_PM
+#ifdef CONFIG_SOFTWARE_SUSPEND
 /* kernel/power/snapshot.c */
 extern void __register_nosave_region(unsigned long b, unsigned long e, int km);
 static inline void register_nosave_region(unsigned long b, unsigned long e)
@@ -72,16 +73,14 @@ extern unsigned long get_safe_page(gfp_t gfp_mask);
 
 extern void hibernation_set_ops(struct hibernation_ops *ops);
 extern int hibernate(void);
-#else
-static inline void register_nosave_region(unsigned long b, unsigned long e) {}
-static inline void register_nosave_region_late(unsigned long b, unsigned long e) {}
+#else /* CONFIG_SOFTWARE_SUSPEND */
 static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
 static inline void swsusp_set_page_free(struct page *p) {}
 static inline void swsusp_unset_page_free(struct page *p) {}
 
 static inline void hibernation_set_ops(struct hibernation_ops *ops) {}
 static inline int hibernate(void) { return -ENOSYS; }
-#endif /* defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) */
+#endif /* CONFIG_SOFTWARE_SUSPEND */
 
 void save_processor_state(void);
 void restore_processor_state(void);
@@ -89,4 +88,43 @@ struct saved_context;
 void __save_processor_state(struct saved_context *ctxt);
 void __restore_processor_state(struct saved_context *ctxt);
 
+/* kernel/power/main.c */
+extern struct blocking_notifier_head pm_chain_head;
+
+static inline int register_pm_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&pm_chain_head, nb);
+}
+
+static inline int unregister_pm_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&pm_chain_head, nb);
+}
+
+#define pm_notifier(fn, pri) {				\
+	static struct notifier_block fn##_nb =			\
+		{ .notifier_call = fn, .priority = pri };	\
+	register_pm_notifier(&fn##_nb);			\
+}
+#else /* CONFIG_PM */
+
+static inline int register_pm_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int unregister_pm_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+
+#define pm_notifier(fn, pri)	do { (void)(fn); } while (0)
+#endif /* CONFIG_PM */
+
+#if !defined CONFIG_SOFTWARE_SUSPEND || !defined(CONFIG_PM)
+static inline void register_nosave_region(unsigned long b, unsigned long e)
+{
+}
+#endif
+
 #endif /* _LINUX_SWSUSP_H */
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 885c653509c9..324ac0188ce1 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -281,9 +281,16 @@ int hibernate(void)
 {
 	int error;
 
+	mutex_lock(&pm_mutex);
 	/* The snapshot device should not be opened while we're running */
-	if (!atomic_add_unless(&snapshot_device_available, -1, 0))
-		return -EBUSY;
+	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+		error = -EBUSY;
+		goto Unlock;
+	}
+
+	error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+	if (error)
+		goto Exit;
 
 	/* Allocate memory management structures */
 	error = create_basic_memory_bitmaps();
@@ -294,7 +301,6 @@ int hibernate(void)
 	if (error)
 		goto Finish;
 
-	mutex_lock(&pm_mutex);
 	if (hibernation_mode == HIBERNATION_TESTPROC) {
 		printk("swsusp debug: Waiting for 5 seconds.\n");
 		mdelay(5000);
@@ -316,12 +322,14 @@ int hibernate(void)
 		swsusp_free();
 	}
  Thaw:
-	mutex_unlock(&pm_mutex);
 	unprepare_processes();
  Finish:
 	free_basic_memory_bitmaps();
  Exit:
+	pm_notifier_call_chain(PM_POST_HIBERNATION);
 	atomic_inc(&snapshot_device_available);
+ Unlock:
+	mutex_unlock(&pm_mutex);
 	return error;
 }
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fc45ed22620f..4d26ad394fb3 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -23,6 +23,8 @@
 
 #include "power.h"
 
+BLOCKING_NOTIFIER_HEAD(pm_chain_head);
+
 /*This is just an arbitrary number */
 #define FREE_PAGE_NUMBER (100)
 
@@ -78,6 +80,10 @@ static int suspend_prepare(suspend_state_t state)
 	if (!pm_ops || !pm_ops->enter)
 		return -EPERM;
 
+	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+	if (error)
+		goto Finish;
+
 	pm_prepare_console();
 
 	if (freeze_processes()) {
@@ -125,6 +131,8 @@ static int suspend_prepare(suspend_state_t state)
  Thaw:
 	thaw_processes();
 	pm_restore_console();
+ Finish:
+	pm_notifier_call_chain(PM_POST_SUSPEND);
 	return error;
 }
 
@@ -176,6 +184,7 @@ static void suspend_finish(suspend_state_t state)
 	resume_console();
 	thaw_processes();
 	pm_restore_console();
+	pm_notifier_call_chain(PM_POST_SUSPEND);
 }
 
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index eab3603b7caf..01c2275b15b2 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -173,5 +173,15 @@ extern void swsusp_close(void);
 extern int suspend_enter(suspend_state_t state);
 
 struct timeval;
+/* kernel/power/swsusp.c */
 extern void swsusp_show_speed(struct timeval *, struct timeval *,
 				unsigned int, char *);
+
+/* kernel/power/main.c */
+extern struct blocking_notifier_head pm_chain_head;
+
+static inline int pm_notifier_call_chain(unsigned long val)
+{
+	return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
+			== NOTIFY_BAD) ? -EINVAL : 0;
+}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 1f24f30b951b..7f19afe01b48 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		if (data->frozen)
 			break;
 		mutex_lock(&pm_mutex);
-		if (freeze_processes()) {
-			thaw_processes();
-			error = -EBUSY;
+		error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+		if (!error) {
+			error = freeze_processes();
+			if (error)
+				thaw_processes();
 		}
+		if (error)
+			pm_notifier_call_chain(PM_POST_HIBERNATION);
 		mutex_unlock(&pm_mutex);
 		if (!error)
 			data->frozen = 1;
@@ -165,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			break;
 		mutex_lock(&pm_mutex);
 		thaw_processes();
+		pm_notifier_call_chain(PM_POST_HIBERNATION);
 		mutex_unlock(&pm_mutex);
 		data->frozen = 0;
 		break;
-- 
cgit v1.3-8-gc7d7


From bd804eba1c8597cbb7cd5a5f9fe886aae16a079a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 19 Jul 2007 01:47:40 -0700
Subject: PM: Introduce pm_power_off_prepare

Introduce the pm_power_off_prepare() callback that can be registered by the
interested platforms in analogy with pm_idle() and pm_power_off(), used for
preparing the system to power off (needed by ACPI).

This allows us to drop acpi_sysclass and device_acpi that are only defined in
order to register the ACPI power off preparation callback, which is needed by
pm_power_off() registered in a much different way.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/sleep/poweroff.c | 38 +++++++++-----------------------------
 include/linux/pm.h            |  1 +
 kernel/sys.c                  |  9 +++++++++
 3 files changed, 19 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c
index 240dde9cf13a..39e40d56b034 100644
--- a/drivers/acpi/sleep/poweroff.c
+++ b/drivers/acpi/sleep/poweroff.c
@@ -39,7 +39,13 @@ int acpi_sleep_prepare(u32 acpi_state)
 
 #ifdef CONFIG_PM
 
-void acpi_power_off(void)
+static void acpi_power_off_prepare(void)
+{
+	/* Prepare to power off the system */
+	acpi_sleep_prepare(ACPI_STATE_S5);
+}
+
+static void acpi_power_off(void)
 {
 	/* acpi_sleep_prepare(ACPI_STATE_S5) should have already been called */
 	printk("%s called\n", __FUNCTION__);
@@ -48,27 +54,6 @@ void acpi_power_off(void)
 	acpi_enter_sleep_state(ACPI_STATE_S5);
 }
 
-static int acpi_shutdown(struct sys_device *x)
-{
-	switch (system_state) {
-	case SYSTEM_POWER_OFF:
-		/* Prepare to power off the system */
-		return acpi_sleep_prepare(ACPI_STATE_S5);
-	default:
-		return 0;
-	}
-}
-
-static struct sysdev_class acpi_sysclass = {
-	set_kset_name("acpi"),
-	.shutdown = acpi_shutdown
-};
-
-static struct sys_device device_acpi = {
-	.id = 0,
-	.cls = &acpi_sysclass,
-};
-
 static int acpi_poweroff_init(void)
 {
 	if (!acpi_disabled) {
@@ -78,13 +63,8 @@ static int acpi_poweroff_init(void)
 		status =
 		    acpi_get_sleep_type_data(ACPI_STATE_S5, &type_a, &type_b);
 		if (ACPI_SUCCESS(status)) {
-			int error;
-			error = sysdev_class_register(&acpi_sysclass);
-			if (!error)
-				error = sysdev_register(&device_acpi);
-			if (!error)
-				pm_power_off = acpi_power_off;
-			return error;
+			pm_power_off_prepare = acpi_power_off_prepare;
+			pm_power_off = acpi_power_off;
 		}
 	}
 	return 0;
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 2735b7cadd20..ad3cc2eb0d34 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -101,6 +101,7 @@ struct pm_dev
  */
 extern void (*pm_idle)(void);
 extern void (*pm_power_off)(void);
+extern void (*pm_power_off_prepare)(void);
 
 typedef int __bitwise suspend_state_t;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 18987c7f6add..d40e40a9446c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -99,6 +99,13 @@ int C_A_D = 1;
 struct pid *cad_pid;
 EXPORT_SYMBOL(cad_pid);
 
+/*
+ * If set, this is used for preparing the system to power off.
+ */
+
+void (*pm_power_off_prepare)(void);
+EXPORT_SYMBOL(pm_power_off_prepare);
+
 /*
  *	Notifier list for kernel code which wants to be called
  *	at shutdown. This is used to stop any idling DMA operations
@@ -867,6 +874,8 @@ EXPORT_SYMBOL_GPL(kernel_halt);
 void kernel_power_off(void)
 {
 	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
+	if (pm_power_off_prepare)
+		pm_power_off_prepare();
 	printk(KERN_EMERG "Power down.\n");
 	machine_power_off();
 }
-- 
cgit v1.3-8-gc7d7


From 5a60d6235c8352ade8f2699e72fcdfe853730456 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <nigel@nigel.suspend2.net>
Date: Thu, 19 Jul 2007 01:47:41 -0700
Subject: PM: Optional beeping during resume from suspend to RAM

Add a feature allowing the user to make the system beep during a resume from
suspend to RAM, on x86_64 and i386.

This is useful for the users with broken resume from RAM, so that they can
verify if the control reaches the kernel after a wake-up event.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/kernel/acpi/wakeup.S   | 27 +++++++++++++++++++++++++++
 arch/x86_64/kernel/acpi/wakeup.S | 25 +++++++++++++++++++++++++
 include/linux/acpi.h             |  1 +
 kernel/power/main.c              | 23 +++++++++++++++++++++++
 4 files changed, 76 insertions(+)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S
index a2295a34b2c7..b4e2ec3c3928 100644
--- a/arch/i386/kernel/acpi/wakeup.S
+++ b/arch/i386/kernel/acpi/wakeup.S
@@ -13,6 +13,21 @@
 # cs = 0x1234, eip = 0x05
 # 
 
+#define BEEP \
+	inb	$97, %al; 	\
+	outb	%al, $0x80; 	\
+	movb	$3, %al; 	\
+	outb	%al, $97; 	\
+	outb	%al, $0x80; 	\
+	movb	$-74, %al; 	\
+	outb	%al, $67; 	\
+	outb	%al, $0x80; 	\
+	movb	$-119, %al; 	\
+	outb	%al, $66; 	\
+	outb	%al, $0x80; 	\
+	movb	$15, %al; 	\
+	outb	%al, $66;
+
 ALIGN
 	.align	4096
 ENTRY(wakeup_start)
@@ -31,6 +46,11 @@ wakeup_code:
 	movw	%cs, %ax
 	movw	%ax, %ds					# Make ds:0 point to wakeup_start
 	movw	%ax, %ss
+
+	testl   $1, beep_flags - wakeup_code
+	jz      1f
+	BEEP
+1:
 	mov	$(wakeup_stack - wakeup_code), %sp		# Private stack is needed for ASUS board
 	movw	$0x0e00 + 'S', %fs:(0x12)
 
@@ -88,6 +108,10 @@ wakeup_code:
 	cmpl	$0x12345678, %eax
 	jne	bogus_real_magic
 
+	testl   $2, beep_flags - wakeup_code
+	jz      1f
+	BEEP
+1:
 	ljmpl	$__KERNEL_CS,$wakeup_pmode_return
 
 real_save_gdt:	.word 0
@@ -98,6 +122,7 @@ real_save_cr4:	.long 0
 real_magic:	.long 0
 video_mode:	.long 0
 video_flags:	.long 0
+beep_flags:	.long 0
 real_efer_save_restore:	.long 0
 real_save_efer_edx: 	.long 0
 real_save_efer_eax: 	.long 0
@@ -262,6 +287,8 @@ ENTRY(acpi_copy_wakeup_routine)
 	movl	%edx, video_mode - wakeup_start (%eax)
 	movl	acpi_video_flags, %edx
 	movl	%edx, video_flags - wakeup_start (%eax)
+	movl	s2ram_beep, %edx
+	movl	%edx, beep_flags - wakeup_start (%eax)
 	movl	$0x12345678, real_magic - wakeup_start (%eax)
 	movl	$0x12345678, saved_magic
 	popl	%ebx
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index 8550a6ffa275..ed63d1845792 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -16,6 +16,21 @@
 # cs = 0x1234, eip = 0x05
 #
 
+#define BEEP \
+	inb	$97, %al; 	\
+	outb	%al, $0x80; 	\
+	movb	$3, %al; 	\
+	outb	%al, $97; 	\
+	outb	%al, $0x80; 	\
+	movb	$-74, %al; 	\
+	outb	%al, $67; 	\
+	outb	%al, $0x80; 	\
+	movb	$-119, %al; 	\
+	outb	%al, $66; 	\
+	outb	%al, $0x80; 	\
+	movb	$15, %al; 	\
+	outb	%al, $66;
+
 
 ALIGN
 	.align	16
@@ -33,6 +48,13 @@ wakeup_code:
 	movw	%cs, %ax
 	movw	%ax, %ds		# Make ds:0 point to wakeup_start
 	movw	%ax, %ss
+
+	# Data segment must be set up before we can see whether to beep.
+	testl   $1, beep_flags - wakeup_code
+	jz      1f
+	BEEP
+1:
+
 					# Private stack is needed for ASUS board
 	mov	$(wakeup_stack - wakeup_code), %sp
 
@@ -229,6 +251,7 @@ gdt_48a:
 	.long   gdta - wakeup_code              # gdt base (relocated in later)
 	
 real_magic:	.quad 0
+beep_flags:	.quad 0
 video_mode:	.quad 0
 video_flags:	.quad 0
 
@@ -344,6 +367,8 @@ ENTRY(acpi_copy_wakeup_routine)
 	pushq	%rax
 	pushq	%rdx
 
+	movl	s2ram_beep, %edx
+	movl	%edx, beep_flags - wakeup_start (,%rdi)
 	movl	saved_video_mode, %edx
 	movl	%edx, video_mode - wakeup_start (,%rdi)
 	movl	acpi_video_flags, %edx
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index fccd8b548d93..c0ccdd720363 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -123,6 +123,7 @@ extern int pci_mmcfg_config_num;
 
 extern int sbf_port;
 extern unsigned long acpi_video_flags;
+extern unsigned long s2ram_beep;
 
 #else	/* !CONFIG_ACPI */
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 32147b57c3bf..c74a56436d8b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -332,6 +332,27 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
 
 power_attr(state);
 
+unsigned long s2ram_beep = 0;
+
+static ssize_t s2ram_beep_show(struct kset *kset, char *buf)
+{
+	return sprintf(buf, "%d\n", s2ram_beep);
+}
+
+static ssize_t
+s2ram_beep_store(struct kset *kset, const char *buf, size_t n)
+{
+	int val;
+
+	if (sscanf(buf, "%d", &val) > 0) {
+		s2ram_beep = val;
+		return n;
+	}
+	return -EINVAL;
+}
+
+power_attr(s2ram_beep);
+
 #ifdef CONFIG_PM_TRACE
 int pm_trace_enabled;
 
@@ -357,11 +378,13 @@ power_attr(pm_trace);
 static struct attribute * g[] = {
 	&state_attr.attr,
 	&pm_trace_attr.attr,
+	&s2ram_beep_attr.attr,
 	NULL,
 };
 #else
 static struct attribute * g[] = {
 	&state_attr.attr,
+	&s2ram_beep_attr.attr,
 	NULL,
 };
 #endif /* CONFIG_PM_TRACE */
-- 
cgit v1.3-8-gc7d7


From 77afcf78a2ded9a91838734234949c0ead5feb12 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Thu, 19 Jul 2007 01:47:41 -0700
Subject: PM: Integrate beeping flag with existing acpi_sleep flags

Move "debug during resume from s2ram" into the variable we already use
for real-mode flags to simplify code. It also closes nasty trap for
the user in acpi_sleep_setup; order of parameters actually mattered there,
acpi_sleep=s3_bios,s3_mode doing something different from
acpi_sleep=s3_mode,s3_bios.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/kernel/acpi/sleep.c    | 12 ++++++++----
 arch/i386/kernel/acpi/wakeup.S   | 18 ++++++++----------
 arch/x86_64/kernel/acpi/sleep.c  |  8 +++++---
 arch/x86_64/kernel/acpi/wakeup.S | 15 ++++++---------
 include/linux/acpi.h             |  3 +--
 kernel/power/main.c              | 23 -----------------------
 kernel/sysctl.c                  |  2 +-
 7 files changed, 29 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c
index 4ee83577bf61..c42b5ab49deb 100644
--- a/arch/i386/kernel/acpi/sleep.c
+++ b/arch/i386/kernel/acpi/sleep.c
@@ -14,7 +14,7 @@
 
 /* address in low memory of the wakeup routine. */
 unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_video_flags;
+unsigned long acpi_realmode_flags;
 extern char wakeup_start, wakeup_end;
 
 extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
@@ -68,9 +68,11 @@ static int __init acpi_sleep_setup(char *str)
 {
 	while ((str != NULL) && (*str != '\0')) {
 		if (strncmp(str, "s3_bios", 7) == 0)
-			acpi_video_flags = 1;
+			acpi_realmode_flags |= 1;
 		if (strncmp(str, "s3_mode", 7) == 0)
-			acpi_video_flags |= 2;
+			acpi_realmode_flags |= 2;
+		if (strncmp(str, "s3_beep", 7) == 0)
+			acpi_realmode_flags |= 4;
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
@@ -80,9 +82,11 @@ static int __init acpi_sleep_setup(char *str)
 
 __setup("acpi_sleep=", acpi_sleep_setup);
 
+/* Ouch, we want to delete this. We already have better version in userspace, in
+   s2ram from suspend.sf.net project */
 static __init int reset_videomode_after_s3(struct dmi_system_id *d)
 {
-	acpi_video_flags |= 2;
+	acpi_realmode_flags |= 2;
 	return 0;
 }
 
diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S
index b4e2ec3c3928..ed0a0f2c1597 100644
--- a/arch/i386/kernel/acpi/wakeup.S
+++ b/arch/i386/kernel/acpi/wakeup.S
@@ -47,7 +47,7 @@ wakeup_code:
 	movw	%ax, %ds					# Make ds:0 point to wakeup_start
 	movw	%ax, %ss
 
-	testl   $1, beep_flags - wakeup_code
+	testl   $4, realmode_flags - wakeup_code
 	jz      1f
 	BEEP
 1:
@@ -61,7 +61,7 @@ wakeup_code:
 	cmpl	$0x12345678, %eax
 	jne	bogus_real_magic
 
-	testl	$1, video_flags - wakeup_code
+	testl	$1, realmode_flags - wakeup_code
 	jz	1f
 	lcall   $0xc000,$3
 	movw	%cs, %ax
@@ -69,7 +69,7 @@ wakeup_code:
 	movw	%ax, %ss
 1:
 
-	testl	$2, video_flags - wakeup_code
+	testl	$2, realmode_flags - wakeup_code
 	jz	1f
 	mov	video_mode - wakeup_code, %ax
 	call	mode_set
@@ -108,11 +108,11 @@ wakeup_code:
 	cmpl	$0x12345678, %eax
 	jne	bogus_real_magic
 
-	testl   $2, beep_flags - wakeup_code
+	testl   $8, realmode_flags - wakeup_code
 	jz      1f
 	BEEP
 1:
-	ljmpl	$__KERNEL_CS,$wakeup_pmode_return
+	ljmpl	$__KERNEL_CS, $wakeup_pmode_return
 
 real_save_gdt:	.word 0
 		.long 0
@@ -121,7 +121,7 @@ real_save_cr3:	.long 0
 real_save_cr4:	.long 0
 real_magic:	.long 0
 video_mode:	.long 0
-video_flags:	.long 0
+realmode_flags:	.long 0
 beep_flags:	.long 0
 real_efer_save_restore:	.long 0
 real_save_efer_edx: 	.long 0
@@ -285,10 +285,8 @@ ENTRY(acpi_copy_wakeup_routine)
 
 	movl	saved_videomode, %edx
 	movl	%edx, video_mode - wakeup_start (%eax)
-	movl	acpi_video_flags, %edx
-	movl	%edx, video_flags - wakeup_start (%eax)
-	movl	s2ram_beep, %edx
-	movl	%edx, beep_flags - wakeup_start (%eax)
+	movl	acpi_realmode_flags, %edx
+	movl	%edx, realmode_flags - wakeup_start (%eax)
 	movl	$0x12345678, real_magic - wakeup_start (%eax)
 	movl	$0x12345678, saved_magic
 	popl	%ebx
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 195b7034a148..4277f2b27e6d 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -55,7 +55,7 @@
 
 /* address in low memory of the wakeup routine. */
 unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_video_flags;
+unsigned long acpi_realmode_flags;
 extern char wakeup_start, wakeup_end;
 
 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
@@ -103,9 +103,11 @@ static int __init acpi_sleep_setup(char *str)
 {
 	while ((str != NULL) && (*str != '\0')) {
 		if (strncmp(str, "s3_bios", 7) == 0)
-			acpi_video_flags = 1;
+			acpi_realmode_flags |= 1;
 		if (strncmp(str, "s3_mode", 7) == 0)
-			acpi_video_flags |= 2;
+			acpi_realmode_flags |= 2;
+		if (strncmp(str, "s3_beep", 7) == 0)
+			acpi_realmode_flags |= 4;
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index ed63d1845792..13f1480cbec9 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -50,7 +50,7 @@ wakeup_code:
 	movw	%ax, %ss
 
 	# Data segment must be set up before we can see whether to beep.
-	testl   $1, beep_flags - wakeup_code
+	testl   $4, realmode_flags - wakeup_code
 	jz      1f
 	BEEP
 1:
@@ -70,7 +70,7 @@ wakeup_code:
 	testl	%eax, %eax
 	jnz	no_longmode
 
-	testl	$1, video_flags - wakeup_code
+	testl	$1, realmode_flags - wakeup_code
 	jz	1f
 	lcall   $0xc000,$3
 	movw	%cs, %ax
@@ -78,7 +78,7 @@ wakeup_code:
 	movw	%ax, %ss
 1:
 
-	testl	$2, video_flags - wakeup_code
+	testl	$2, realmode_flags - wakeup_code
 	jz	1f
 	mov	video_mode - wakeup_code, %ax
 	call	mode_seta
@@ -251,9 +251,8 @@ gdt_48a:
 	.long   gdta - wakeup_code              # gdt base (relocated in later)
 	
 real_magic:	.quad 0
-beep_flags:	.quad 0
 video_mode:	.quad 0
-video_flags:	.quad 0
+realmode_flags:	.quad 0
 
 .code16
 bogus_real_magic:
@@ -367,12 +366,10 @@ ENTRY(acpi_copy_wakeup_routine)
 	pushq	%rax
 	pushq	%rdx
 
-	movl	s2ram_beep, %edx
-	movl	%edx, beep_flags - wakeup_start (,%rdi)
 	movl	saved_video_mode, %edx
 	movl	%edx, video_mode - wakeup_start (,%rdi)
-	movl	acpi_video_flags, %edx
-	movl	%edx, video_flags - wakeup_start (,%rdi)
+	movl	acpi_realmode_flags, %edx
+	movl	%edx, realmode_flags - wakeup_start (,%rdi)
 	movq	$0x12345678, real_magic - wakeup_start (,%rdi)
 	movq	$0x123456789abcdef0, %rdx
 	movq	%rdx, saved_magic
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c0ccdd720363..dc234c508a6f 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -122,8 +122,7 @@ extern struct acpi_mcfg_allocation *pci_mmcfg_config;
 extern int pci_mmcfg_config_num;
 
 extern int sbf_port;
-extern unsigned long acpi_video_flags;
-extern unsigned long s2ram_beep;
+extern unsigned long acpi_realmode_flags;
 
 #else	/* !CONFIG_ACPI */
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index c74a56436d8b..32147b57c3bf 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -332,27 +332,6 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
 
 power_attr(state);
 
-unsigned long s2ram_beep = 0;
-
-static ssize_t s2ram_beep_show(struct kset *kset, char *buf)
-{
-	return sprintf(buf, "%d\n", s2ram_beep);
-}
-
-static ssize_t
-s2ram_beep_store(struct kset *kset, const char *buf, size_t n)
-{
-	int val;
-
-	if (sscanf(buf, "%d", &val) > 0) {
-		s2ram_beep = val;
-		return n;
-	}
-	return -EINVAL;
-}
-
-power_attr(s2ram_beep);
-
 #ifdef CONFIG_PM_TRACE
 int pm_trace_enabled;
 
@@ -378,13 +357,11 @@ power_attr(pm_trace);
 static struct attribute * g[] = {
 	&state_attr.attr,
 	&pm_trace_attr.attr,
-	&s2ram_beep_attr.attr,
 	NULL,
 };
 #else
 static struct attribute * g[] = {
 	&state_attr.attr,
-	&s2ram_beep_attr.attr,
 	NULL,
 };
 #endif /* CONFIG_PM_TRACE */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 44a1d699aad7..3ed4912bf183 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -660,7 +660,7 @@ static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_ACPI_VIDEO_FLAGS,
 		.procname	= "acpi_video_flags",
-		.data		= &acpi_video_flags,
+		.data		= &acpi_realmode_flags,
 		.maxlen		= sizeof (unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &proc_doulongvec_minmax,
-- 
cgit v1.3-8-gc7d7


From e53252d97e670a38b1d2e9723b48077bba11ddda Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Thu, 19 Jul 2007 01:47:51 -0700
Subject: unregister_chrdev() return void

unregister_chrdev() does not return meaningful value.  This patch makes it
return void like most unregister_* functions.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/char_dev.c      | 3 +--
 include/linux/fs.h | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index 164a45cdaf5f..bbbf07baa145 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -321,14 +321,13 @@ void unregister_chrdev_region(dev_t from, unsigned count)
 	}
 }
 
-int unregister_chrdev(unsigned int major, const char *name)
+void unregister_chrdev(unsigned int major, const char *name)
 {
 	struct char_device_struct *cd;
 	cd = __unregister_chrdev_region(major, 0, 256);
 	if (cd && cd->cdev)
 		cdev_del(cd->cdev);
 	kfree(cd);
-	return 0;
 }
 
 static DEFINE_SPINLOCK(cdev_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9562a59b3703..75dd16efc9b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1463,7 +1463,7 @@ extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
 extern int register_chrdev_region(dev_t, unsigned, const char *);
 extern int register_chrdev(unsigned int, const char *,
 			   const struct file_operations *);
-extern int unregister_chrdev(unsigned int, const char *);
+extern void unregister_chrdev(unsigned int, const char *);
 extern void unregister_chrdev_region(dev_t, unsigned);
 extern int chrdev_open(struct inode *, struct file *);
 extern void chrdev_show(struct seq_file *,off_t);
-- 
cgit v1.3-8-gc7d7


From 2ba2d00363975242dee9bb22cf798b487e3cd61e Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Thu, 19 Jul 2007 01:47:55 -0700
Subject: AIO sparse fix (type of ki_flags)

Fix type issue reported by latest 'sparse': kiocb.ki_flags should be
"unsigned long" (not "long"), to match bitop type signature.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/aio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index b903fc02bdb7..d10e608f232d 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -86,7 +86,7 @@ struct kioctx;
  */
 struct kiocb {
 	struct list_head	ki_run_list;
-	long			ki_flags;
+	unsigned long		ki_flags;
 	int			ki_users;
 	unsigned		ki_key;		/* id of this request */
 
-- 
cgit v1.3-8-gc7d7


From d77c2d7cc5126639a47d73300b40d461f2811a0f Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Thu, 19 Jul 2007 01:47:55 -0700
Subject: readahead: introduce PG_readahead

Introduce a new page flag: PG_readahead.

It acts as a look-ahead mark, which tells the page reader: Hey, it's time to
invoke the read-ahead logic.  For the sake of I/O pipelining, don't wait until
it runs out of cached pages!

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 5 +++++
 mm/page_alloc.c            | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 731cd2ac3227..709d92fd2877 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -83,6 +83,7 @@
 #define PG_private		11	/* If pagecache, has fs-private data */
 
 #define PG_writeback		12	/* Page is under writeback */
+#define PG_readahead		13	/* Reminder to do async read-ahead */
 #define PG_compound		14	/* Part of a compound page */
 #define PG_swapcache		15	/* Swap page: swp_entry_t in private */
 
@@ -226,6 +227,10 @@ static inline void SetPageUptodate(struct page *page)
 #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
 #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
 
+#define PageReadahead(page)	test_bit(PG_readahead, &(page)->flags)
+#define SetPageReadahead(page)	set_bit(PG_readahead, &(page)->flags)
+#define ClearPageReadahead(page) clear_bit(PG_readahead, &(page)->flags)
+
 #define PageReclaim(page)	test_bit(PG_reclaim, &(page)->flags)
 #define SetPageReclaim(page)	set_bit(PG_reclaim, &(page)->flags)
 #define ClearPageReclaim(page)	clear_bit(PG_reclaim, &(page)->flags)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2a10b957f23..2165be9462c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -617,7 +617,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 	if (PageReserved(page))
 		return 1;
 
-	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
-- 
cgit v1.3-8-gc7d7


From 5ce1110b92b31d079aa443e967f43a2294e01194 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Thu, 19 Jul 2007 01:47:59 -0700
Subject: readahead: data structure and routines

Extend struct file_ra_state to support the on-demand readahead logic.  Also
define some helpers for it.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/readahead.c     | 19 ++++++++++++++++
 2 files changed, 83 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 75dd16efc9b6..9a5f562abc77 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -695,6 +695,10 @@ struct fown_struct {
 
 /*
  * Track a single file's readahead state
+ *
+ *  ================#============|==================#==================|
+ *                  ^            ^                  ^                  ^
+ *  file_ra_state.la_index    .ra_index   .lookahead_index   .readahead_index
  */
 struct file_ra_state {
 	unsigned long start;		/* Current window */
@@ -704,6 +708,12 @@ struct file_ra_state {
 	unsigned long prev_index;	/* Cache last read() position */
 	unsigned long ahead_start;	/* Ahead window */
 	unsigned long ahead_size;
+
+	pgoff_t la_index;               /* enqueue time */
+	pgoff_t ra_index;               /* begin offset */
+	pgoff_t lookahead_index;        /* time to do next readahead */
+	pgoff_t readahead_index;        /* end offset */
+
 	unsigned long ra_pages;		/* Maximum readahead window */
 	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
 	unsigned long mmap_miss;	/* Cache miss stat for mmap accesses */
@@ -712,6 +722,60 @@ struct file_ra_state {
 #define RA_FLAG_MISS 0x01	/* a cache miss occured against this file */
 #define RA_FLAG_INCACHE 0x02	/* file is already in cache */
 
+/*
+ * Measuring read-ahead sizes.
+ *
+ *                  |----------- readahead size ------------>|
+ *  ===#============|==================#=====================|
+ *     |------- invoke interval ------>|-- lookahead size -->|
+ */
+static inline unsigned long ra_readahead_size(struct file_ra_state *ra)
+{
+	return ra->readahead_index - ra->ra_index;
+}
+
+static inline unsigned long ra_lookahead_size(struct file_ra_state *ra)
+{
+	return ra->readahead_index - ra->lookahead_index;
+}
+
+static inline unsigned long ra_invoke_interval(struct file_ra_state *ra)
+{
+	return ra->lookahead_index - ra->la_index;
+}
+
+/*
+ * Check if @index falls in the readahead windows.
+ */
+static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
+{
+	return (index >= ra->la_index &&
+		index <  ra->readahead_index);
+}
+
+/*
+ * Where is the old read-ahead and look-ahead?
+ */
+static inline void ra_set_index(struct file_ra_state *ra,
+				pgoff_t la_index, pgoff_t ra_index)
+{
+	ra->la_index = la_index;
+	ra->ra_index = ra_index;
+}
+
+/*
+ * Where is the new read-ahead and look-ahead?
+ */
+static inline void ra_set_size(struct file_ra_state *ra,
+				unsigned long ra_size, unsigned long la_size)
+{
+	ra->readahead_index = ra->ra_index + ra_size;
+	ra->lookahead_index = ra->ra_index + ra_size - la_size;
+}
+
+unsigned long ra_submit(struct file_ra_state *ra,
+		       struct address_space *mapping, struct file *filp);
+
 struct file {
 	/*
 	 * fu_list becomes invalid after file_free is called and queued via
diff --git a/mm/readahead.c b/mm/readahead.c
index 7f9bf588c936..072ce8f8357d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -592,3 +592,22 @@ unsigned long max_sane_readahead(unsigned long nr)
 	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
 		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
 }
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+unsigned long ra_submit(struct file_ra_state *ra,
+		       struct address_space *mapping, struct file *filp)
+{
+	unsigned long ra_size;
+	unsigned long la_size;
+	int actual;
+
+	ra_size = ra_readahead_size(ra);
+	la_size = ra_lookahead_size(ra);
+	actual = __do_page_cache_readahead(mapping, filp,
+					ra->ra_index, ra_size, la_size);
+
+	return actual;
+}
+EXPORT_SYMBOL_GPL(ra_submit);
-- 
cgit v1.3-8-gc7d7


From 122a21d11cbfda6d1e33cbc8ae9e4c4ee2f1886e Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Thu, 19 Jul 2007 01:48:01 -0700
Subject: readahead: on-demand readahead logic

This is a minimal readahead algorithm that aims to replace the current one.
It is more flexible and reliable, while maintaining almost the same behavior
and performance.  Also it is full integrated with adaptive readahead.

It is designed to be called on demand:
	- on a missing page, to do synchronous readahead
	- on a lookahead page, to do asynchronous readahead

In this way it eliminated the awkward workarounds for cache hit/miss,
readahead thrashing, retried read, and unaligned read.  It also adopts the
data structure introduced by adaptive readahead, parameterizes readahead
pipelining with `lookahead_index', and reduces the current/ahead windows to
one single window.

HEURISTICS

The logic deals with four cases:

	- sequential-next
		found a consistent readahead window, so push it forward

	- random
		standalone small read, so read as is

	- sequential-first
		create a new readahead window for a sequential/oversize request

	- lookahead-clueless
		hit a lookahead page not associated with the readahead window,
		so create a new readahead window and ramp it up

In each case, three parameters are determined:

	- readahead index: where the next readahead begins
	- readahead size:  how much to readahead
	- lookahead size:  when to do the next readahead (for pipelining)

BEHAVIORS

The old behaviors are maximally preserved for trivial sequential/random reads.
Notable changes are:

	- It no longer imposes strict sequential checks.
	  It might help some interleaved cases, and clustered random reads.
	  It does introduce risks of a random lookahead hit triggering an
	  unexpected readahead. But in general it is more likely to do good
	  than to do evil.

	- Interleaved reads are supported in a minimal way.
	  Their chances of being detected and proper handled are still low.

	- Readahead thrashings are better handled.
	  The current readahead leads to tiny average I/O sizes, because it
	  never turn back for the thrashed pages.  They have to be fault in
	  by do_generic_mapping_read() one by one.  Whereas the on-demand
	  readahead will redo readahead for them.

OVERHEADS

The new code reduced the overheads of

	- excessively calling the readahead routine on small sized reads
	  (the current readahead code insists on seeing all requests)

	- doing a lot of pointless page-cache lookups for small cached files
	  (the current readahead only turns itself off after 256 cache hits,
	  unfortunately most files are < 1MB, so never see that chance)

That accounts for speedup of
	- 0.3% on 1-page sequential reads on sparse file
	- 1.2% on 1-page cache hot sequential reads
	- 3.2% on 256-page cache hot sequential reads
	- 1.3% on cache hot `tar /lib`

However, it does introduce one extra page-cache lookup per cache miss, which
impacts random reads slightly. That's 1% overheads for 1-page random reads on
sparse file.

PERFORMANCE

The basic benchmark setup is
	- 2.6.20 kernel with on-demand readahead
	- 1MB max readahead size
	- 2.9GHz Intel Core 2 CPU
	- 2GB memory
	- 160G/8M Hitachi SATA II 7200 RPM disk

The benchmarks show that
	- it maintains the same performance for trivial sequential/random reads
	- sysbench/OLTP performance on MySQL gains up to 8%
	- performance on readahead thrashing gains up to 3 times

iozone throughput (KB/s): roughly the same
==========================================
iozone -c -t1 -s 4096m -r 64k

			       2.6.20          on-demand      gain
first run
	  "  Initial write "   61437.27        64521.53      +5.0%
	  "        Rewrite "   47893.02        48335.20      +0.9%
	  "           Read "   62111.84        62141.49      +0.0%
	  "        Re-read "   62242.66        62193.17      -0.1%
	  "   Reverse Read "   50031.46        49989.79      -0.1%
	  "    Stride read "    8657.61         8652.81      -0.1%
	  "    Random read "   13914.28        13898.23      -0.1%
	  " Mixed workload "   19069.27        19033.32      -0.2%
	  "   Random write "   14849.80        14104.38      -5.0%
	  "         Pwrite "   62955.30        65701.57      +4.4%
	  "          Pread "   62209.99        62256.26      +0.1%

second run
	  "  Initial write "   60810.31        66258.69      +9.0%
	  "        Rewrite "   49373.89        57833.66     +17.1%
	  "           Read "   62059.39        62251.28      +0.3%
	  "        Re-read "   62264.32        62256.82      -0.0%
	  "   Reverse Read "   49970.96        50565.72      +1.2%
	  "    Stride read "    8654.81         8638.45      -0.2%
	  "    Random read "   13901.44        13949.91      +0.3%
	  " Mixed workload "   19041.32        19092.04      +0.3%
	  "   Random write "   14019.99        14161.72      +1.0%
	  "         Pwrite "   64121.67        68224.17      +6.4%
	  "          Pread "   62225.08        62274.28      +0.1%

In summary, writes are unstable, reads are pretty close on average:

			  access pattern  2.6.20  on-demand   gain
				   Read  62085.61  62196.38  +0.2%
				Re-read  62253.49  62224.99  -0.0%
			   Reverse Read  50001.21  50277.75  +0.6%
			    Stride read   8656.21   8645.63  -0.1%
			    Random read  13907.86  13924.07  +0.1%
	 		 Mixed workload  19055.29  19062.68  +0.0%
				  Pread  62217.53  62265.27  +0.1%

aio-stress: roughly the same
============================
aio-stress -l -s4096 -r128 -t1 -o1 knoppix511-dvd-cn.iso
aio-stress -l -s4096 -r128 -t1 -o3 knoppix511-dvd-cn.iso

					2.6.20      on-demand  delta
			sequential	 92.57s      92.54s    -0.0%
			random		311.87s     312.15s    +0.1%

sysbench fileio: roughly the same
=================================
sysbench --test=fileio --file-io-mode=async --file-test-mode=rndrw \
	 --file-total-size=4G --file-block-size=64K \
	 --num-threads=001 --max-requests=10000 --max-time=900 run

				threads    2.6.20   on-demand    delta
		first run
				      1   59.1974s    59.2262s  +0.0%
				      2   58.0575s    58.2269s  +0.3%
				      4   48.0545s    47.1164s  -2.0%
				      8   41.0684s    41.2229s  +0.4%
				     16   35.8817s    36.4448s  +1.6%
				     32   32.6614s    32.8240s  +0.5%
				     64   23.7601s    24.1481s  +1.6%
				    128   24.3719s    23.8225s  -2.3%
				    256   23.2366s    22.0488s  -5.1%

		second run
				      1   59.6720s    59.5671s  -0.2%
				      8   41.5158s    41.9541s  +1.1%
				     64   25.0200s    23.9634s  -4.2%
				    256   22.5491s    20.9486s  -7.1%

Note that the numbers are not very stable because of the writes.
The overall performance is close when we sum all seconds up:

                sum all up               495.046s    491.514s   -0.7%

sysbench oltp (trans/sec): up to 8% gain
========================================
sysbench --test=oltp --oltp-table-size=10000000 --oltp-read-only \
	 --mysql-socket=/var/run/mysqld/mysqld.sock \
	 --mysql-user=root --mysql-password=readahead \
	 --num-threads=064 --max-requests=10000 --max-time=900 run

	10000-transactions run
				threads    2.6.20   on-demand    gain
				      1     62.81       64.56   +2.8%
				      2     67.97       70.93   +4.4%
				      4     81.81       85.87   +5.0%
				      8     94.60       97.89   +3.5%
				     16     99.07      104.68   +5.7%
				     32     95.93      104.28   +8.7%
				     64     96.48      103.68   +7.5%
	5000-transactions run
				      1     48.21       48.65   +0.9%
				      8     68.60       70.19   +2.3%
				     64     70.57       74.72   +5.9%
	2000-transactions run
				      1     37.57       38.04   +1.3%
				      2     38.43       38.99   +1.5%
				      4     45.39       46.45   +2.3%
				      8     51.64       52.36   +1.4%
				     16     54.39       55.18   +1.5%
				     32     52.13       54.49   +4.5%
				     64     54.13       54.61   +0.9%

That's interesting results. Some investigations show that
	- MySQL is accessing the db file non-uniformly: some parts are
	  more hot than others
	- It is mostly doing 4-page random reads, and sometimes doing two
	  reads in a row, the latter one triggers a 16-page readahead.
	- The on-demand readahead leaves many lookahead pages (flagged
	  PG_readahead) there. Many of them will be hit, and trigger
	  more readahead pages. Which might save more seeks.
	- Naturally, the readahead windows tend to lie in hot areas,
	  and the lookahead pages in hot areas is more likely to be hit.
	- The more overall read density, the more possible gain.

That also explains the adaptive readahead tricks for clustered random reads.

readahead thrashing: 3 times better
===================================
We boot kernel with "mem=128m single", and start a 100KB/s stream on every
second, until reaching 200 streams.

			      max throughput     min avg I/O size
		2.6.20:            5MB/s               16KB
		on-demand:        15MB/s              140KB

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |   6 ++
 mm/readahead.c     | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8e12b3b6110..619c0e80cf0c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1138,6 +1138,12 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
+unsigned long page_cache_readahead_ondemand(struct address_space *mapping,
+			  struct file_ra_state *ra,
+			  struct file *filp,
+			  struct page *page,
+			  pgoff_t offset,
+			  unsigned long size);
 unsigned long page_cache_readahead(struct address_space *mapping,
 			  struct file_ra_state *ra,
 			  struct file *filp,
diff --git a/mm/readahead.c b/mm/readahead.c
index 072ce8f8357d..c094e4f5a250 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -611,3 +611,177 @@ unsigned long ra_submit(struct file_ra_state *ra,
 	return actual;
 }
 EXPORT_SYMBOL_GPL(ra_submit);
+
+/*
+ *  Get the previous window size, ramp it up, and
+ *  return it as the new window size.
+ */
+static unsigned long get_next_ra_size2(struct file_ra_state *ra,
+						unsigned long max)
+{
+	unsigned long cur = ra->readahead_index - ra->ra_index;
+	unsigned long newsize;
+
+	if (cur < max / 16)
+		newsize = cur * 4;
+	else
+		newsize = cur * 2;
+
+	return min(newsize, max);
+}
+
+/*
+ * On-demand readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ *                    |-------- last readahead window -------->|
+ *       |-- application walking here -->|
+ * ======#============|==================#=====================|
+ *       ^la_index    ^ra_index          ^lookahead_index      ^readahead_index
+ *
+ * [ra_index, readahead_index) represents the last readahead window.
+ *
+ * [la_index, lookahead_index] is where the application would be walking(in
+ * the common case of cache-cold sequential reads): the last window was
+ * established when the application was at la_index, and the next window will
+ * be bring in when the application reaches lookahead_index.
+ *
+ * To overlap application thinking time and disk I/O time, we do
+ * `readahead pipelining': Do not wait until the application consumed all
+ * readahead pages and stalled on the missing page at readahead_index;
+ * Instead, submit an asynchronous readahead I/O as early as the application
+ * reads on the page at lookahead_index. Normally lookahead_index will be
+ * equal to ra_index, for maximum pipelining.
+ *
+ * In interleaved sequential reads, concurrent streams on the same fd can
+ * be invalidating each other's readahead state. So we flag the new readahead
+ * page at lookahead_index with PG_readahead, and use it as readahead
+ * indicator. The flag won't be set on already cached pages, to avoid the
+ * readahead-for-nothing fuss, saving pointless page cache lookups.
+ *
+ * prev_index tracks the last visited page in the _previous_ read request.
+ * It should be maintained by the caller, and will be used for detecting
+ * small random reads. Note that the readahead algorithm checks loosely
+ * for sequential patterns. Hence interleaved reads might be served as
+ * sequential ones.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * The code ramps up the readahead size aggressively at first, but slow down as
+ * it approaches max_readhead.
+ */
+
+/*
+ * A minimal readahead algorithm for trivial sequential/random reads.
+ */
+static unsigned long
+ondemand_readahead(struct address_space *mapping,
+		   struct file_ra_state *ra, struct file *filp,
+		   struct page *page, pgoff_t offset,
+		   unsigned long req_size)
+{
+	unsigned long max;	/* max readahead pages */
+	pgoff_t ra_index;	/* readahead index */
+	unsigned long ra_size;	/* readahead size */
+	unsigned long la_size;	/* lookahead size */
+	int sequential;
+
+	max = ra->ra_pages;
+	sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
+
+	/*
+	 * Lookahead/readahead hit, assume sequential access.
+	 * Ramp up sizes, and push forward the readahead window.
+	 */
+	if (offset && (offset == ra->lookahead_index ||
+			offset == ra->readahead_index)) {
+		ra_index = ra->readahead_index;
+		ra_size = get_next_ra_size2(ra, max);
+		la_size = ra_size;
+		goto fill_ra;
+	}
+
+	/*
+	 * Standalone, small read.
+	 * Read as is, and do not pollute the readahead state.
+	 */
+	if (!page && !sequential) {
+		return __do_page_cache_readahead(mapping, filp,
+						offset, req_size, 0);
+	}
+
+	/*
+	 * It may be one of
+	 * 	- first read on start of file
+	 * 	- sequential cache miss
+	 * 	- oversize random read
+	 * Start readahead for it.
+	 */
+	ra_index = offset;
+	ra_size = get_init_ra_size(req_size, max);
+	la_size = ra_size > req_size ? ra_size - req_size : ra_size;
+
+	/*
+	 * Hit on a lookahead page without valid readahead state.
+	 * E.g. interleaved reads.
+	 * Not knowing its readahead pos/size, bet on the minimal possible one.
+	 */
+	if (page) {
+		ra_index++;
+		ra_size = min(4 * ra_size, max);
+	}
+
+fill_ra:
+	ra_set_index(ra, offset, ra_index);
+	ra_set_size(ra, ra_size, la_size);
+
+	return ra_submit(ra, mapping, filp);
+}
+
+/**
+ * page_cache_readahead_ondemand - generic file readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @page: the page at @offset, or NULL if non-present
+ * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
+ * @req_size: hint: total size of the read which the caller is performing in
+ *            PAGE_CACHE_SIZE units
+ *
+ * page_cache_readahead_ondemand() is the entry point of readahead logic.
+ * This function should be called when it is time to perform readahead:
+ * 1) @page == NULL
+ *    A cache miss happened, time for synchronous readahead.
+ * 2) @page != NULL && PageReadahead(@page)
+ *    A look-ahead hit occured, time for asynchronous readahead.
+ */
+unsigned long
+page_cache_readahead_ondemand(struct address_space *mapping,
+				struct file_ra_state *ra, struct file *filp,
+				struct page *page, pgoff_t offset,
+				unsigned long req_size)
+{
+	/* no read-ahead */
+	if (!ra->ra_pages)
+		return 0;
+
+	if (page) {
+		ClearPageReadahead(page);
+
+		/*
+		 * Defer asynchronous read-ahead on IO congestion.
+		 */
+		if (bdi_read_congested(mapping->backing_dev_info))
+			return 0;
+	}
+
+	/* do read-ahead */
+	return ondemand_readahead(mapping, ra, filp, page,
+					offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_readahead_ondemand);
-- 
cgit v1.3-8-gc7d7


From c743d96b6d2ff55a94df7b5ac7c74987bb9c343b Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Thu, 19 Jul 2007 01:48:04 -0700
Subject: readahead: remove the old algorithm

Remove the old readahead algorithm.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h |  11 +-
 include/linux/mm.h |   7 -
 mm/readahead.c     | 373 ++++-------------------------------------------------
 3 files changed, 26 insertions(+), 365 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9a5f562abc77..29cb32d3a849 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -701,14 +701,6 @@ struct fown_struct {
  *  file_ra_state.la_index    .ra_index   .lookahead_index   .readahead_index
  */
 struct file_ra_state {
-	unsigned long start;		/* Current window */
-	unsigned long size;
-	unsigned long flags;		/* ra flags RA_FLAG_xxx*/
-	unsigned long cache_hit;	/* cache hit count*/
-	unsigned long prev_index;	/* Cache last read() position */
-	unsigned long ahead_start;	/* Ahead window */
-	unsigned long ahead_size;
-
 	pgoff_t la_index;               /* enqueue time */
 	pgoff_t ra_index;               /* begin offset */
 	pgoff_t lookahead_index;        /* time to do next readahead */
@@ -717,10 +709,9 @@ struct file_ra_state {
 	unsigned long ra_pages;		/* Maximum readahead window */
 	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
 	unsigned long mmap_miss;	/* Cache miss stat for mmap accesses */
+	unsigned long prev_index;	/* Cache last read() position */
 	unsigned int prev_offset;	/* Offset where last read() ended in a page */
 };
-#define RA_FLAG_MISS 0x01	/* a cache miss occured against this file */
-#define RA_FLAG_INCACHE 0x02	/* file is already in cache */
 
 /*
  * Measuring read-ahead sizes.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 619c0e80cf0c..3d0d7d285237 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1144,13 +1144,6 @@ unsigned long page_cache_readahead_ondemand(struct address_space *mapping,
 			  struct page *page,
 			  pgoff_t offset,
 			  unsigned long size);
-unsigned long page_cache_readahead(struct address_space *mapping,
-			  struct file_ra_state *ra,
-			  struct file *filp,
-			  pgoff_t offset,
-			  unsigned long size);
-void handle_ra_miss(struct address_space *mapping, 
-		    struct file_ra_state *ra, pgoff_t offset);
 unsigned long max_sane_readahead(unsigned long nr);
 
 /* Do stack extension */
diff --git a/mm/readahead.c b/mm/readahead.c
index c094e4f5a250..5b3c9b7d70fa 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -49,82 +49,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
 
-/*
- * Return max readahead size for this inode in number-of-pages.
- */
-static inline unsigned long get_max_readahead(struct file_ra_state *ra)
-{
-	return ra->ra_pages;
-}
-
-static inline unsigned long get_min_readahead(struct file_ra_state *ra)
-{
-	return MIN_RA_PAGES;
-}
-
-static inline void reset_ahead_window(struct file_ra_state *ra)
-{
-	/*
-	 * ... but preserve ahead_start + ahead_size value,
-	 * see 'recheck:' label in page_cache_readahead().
-	 * Note: We never use ->ahead_size as rvalue without
-	 * checking ->ahead_start != 0 first.
-	 */
-	ra->ahead_size += ra->ahead_start;
-	ra->ahead_start = 0;
-}
-
-static inline void ra_off(struct file_ra_state *ra)
-{
-	ra->start = 0;
-	ra->flags = 0;
-	ra->size = 0;
-	reset_ahead_window(ra);
-	return;
-}
-
-/*
- * Set the initial window size, round to next power of 2 and square
- * for small size, x 4 for medium, and x 2 for large
- * for 128k (32 page) max ra
- * 1-8 page = 32k initial, > 8 page = 128k initial
- */
-static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
-{
-	unsigned long newsize = roundup_pow_of_two(size);
-
-	if (newsize <= max / 32)
-		newsize = newsize * 4;
-	else if (newsize <= max / 4)
-		newsize = newsize * 2;
-	else
-		newsize = max;
-	return newsize;
-}
-
-/*
- * Set the new window size, this is called only when I/O is to be submitted,
- * not for each call to readahead.  If a cache miss occured, reduce next I/O
- * size, else increase depending on how close to max we are.
- */
-static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
-{
-	unsigned long max = get_max_readahead(ra);
-	unsigned long min = get_min_readahead(ra);
-	unsigned long cur = ra->size;
-	unsigned long newsize;
-
-	if (ra->flags & RA_FLAG_MISS) {
-		ra->flags &= ~RA_FLAG_MISS;
-		newsize = max((cur - 2), min);
-	} else if (cur < max / 16) {
-		newsize = 4 * cur;
-	} else {
-		newsize = 2 * cur;
-	}
-	return min(newsize, max);
-}
-
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
 /**
@@ -200,66 +124,6 @@ out:
 	return ret;
 }
 
-/*
- * Readahead design.
- *
- * The fields in struct file_ra_state represent the most-recently-executed
- * readahead attempt:
- *
- * start:	Page index at which we started the readahead
- * size:	Number of pages in that read
- *              Together, these form the "current window".
- *              Together, start and size represent the `readahead window'.
- * prev_index:  The page which the readahead algorithm most-recently inspected.
- *              It is mainly used to detect sequential file reading.
- *              If page_cache_readahead sees that it is again being called for
- *              a page which it just looked at, it can return immediately without
- *              making any state changes.
- * offset:      Offset in the prev_index where the last read ended - used for
- *              detection of sequential file reading.
- * ahead_start,
- * ahead_size:  Together, these form the "ahead window".
- * ra_pages:	The externally controlled max readahead for this fd.
- *
- * When readahead is in the off state (size == 0), readahead is disabled.
- * In this state, prev_index is used to detect the resumption of sequential I/O.
- *
- * The readahead code manages two windows - the "current" and the "ahead"
- * windows.  The intent is that while the application is walking the pages
- * in the current window, I/O is underway on the ahead window.  When the
- * current window is fully traversed, it is replaced by the ahead window
- * and the ahead window is invalidated.  When this copying happens, the
- * new current window's pages are probably still locked.  So
- * we submit a new batch of I/O immediately, creating a new ahead window.
- *
- * So:
- *
- *   ----|----------------|----------------|-----
- *       ^start           ^start+size
- *                        ^ahead_start     ^ahead_start+ahead_size
- *
- *         ^ When this page is read, we submit I/O for the
- *           ahead window.
- *
- * A `readahead hit' occurs when a read request is made against a page which is
- * the next sequential page. Ahead window calculations are done only when it
- * is time to submit a new IO.  The code ramps up the size agressively at first,
- * but slow down as it approaches max_readhead.
- *
- * Any seek/ramdom IO will result in readahead being turned off.  It will resume
- * at the first sequential access.
- *
- * There is a special-case: if the first page which the application tries to
- * read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to the initial size
- * based on I/O request size and the max_readahead.
- *
- * This function is to be called for every read request, rather than when
- * it is time to perform readahead.  It is called only once for the entire I/O
- * regardless of size unless readahead is unable to start enough I/O to satisfy
- * the request (I/O request > max_readahead).
- */
-
 /*
  * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
  * the pages first, then submits them all for I/O. This avoids the very bad
@@ -295,7 +159,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	read_lock_irq(&mapping->tree_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		pgoff_t page_offset = offset + page_idx;
-		
+
 		if (page_offset > end_index)
 			break;
 
@@ -360,28 +224,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return ret;
 }
 
-/*
- * Check how effective readahead is being.  If the amount of started IO is
- * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping.
- *
- */
-static inline int check_ra_success(struct file_ra_state *ra,
-			unsigned long nr_to_read, unsigned long actual)
-{
-	if (actual == 0) {
-		ra->cache_hit += nr_to_read;
-		if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
-			ra_off(ra);
-			ra->flags |= RA_FLAG_INCACHE;
-			return 0;
-		}
-	} else {
-		ra->cache_hit=0;
-	}
-	return 1;
-}
-
 /*
  * This version skips the IO if the queue is read-congested, and will tell the
  * block layer to abandon the readahead if request allocation would block.
@@ -398,191 +240,6 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
 }
 
-/*
- * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
- * is set wait till the read completes.  Otherwise attempt to read without
- * blocking.
- * Returns 1 meaning 'success' if read is successful without switching off
- * readahead mode. Otherwise return failure.
- */
-static int
-blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			pgoff_t offset, unsigned long nr_to_read,
-			struct file_ra_state *ra, int block)
-{
-	int actual;
-
-	if (!block && bdi_read_congested(mapping->backing_dev_info))
-		return 0;
-
-	actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
-
-	return check_ra_success(ra, nr_to_read, actual);
-}
-
-static int make_ahead_window(struct address_space *mapping, struct file *filp,
-				struct file_ra_state *ra, int force)
-{
-	int block, ret;
-
-	ra->ahead_size = get_next_ra_size(ra);
-	ra->ahead_start = ra->start + ra->size;
-
-	block = force || (ra->prev_index >= ra->ahead_start);
-	ret = blockable_page_cache_readahead(mapping, filp,
-			ra->ahead_start, ra->ahead_size, ra, block);
-
-	if (!ret && !force) {
-		/* A read failure in blocking mode, implies pages are
-		 * all cached. So we can safely assume we have taken
-		 * care of all the pages requested in this call.
-		 * A read failure in non-blocking mode, implies we are
-		 * reading more pages than requested in this call.  So
-		 * we safely assume we have taken care of all the pages
-		 * requested in this call.
-		 *
-		 * Just reset the ahead window in case we failed due to
-		 * congestion.  The ahead window will any way be closed
-		 * in case we failed due to excessive page cache hits.
-		 */
-		reset_ahead_window(ra);
-	}
-
-	return ret;
-}
-
-/**
- * page_cache_readahead - generic adaptive readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
- * @req_size: hint: total size of the read which the caller is performing in
- *            PAGE_CACHE_SIZE units
- *
- * page_cache_readahead() is the main function.  It performs the adaptive
- * readahead window size management and submits the readahead I/O.
- *
- * Note that @filp is purely used for passing on to the ->readpage[s]()
- * handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
- * Also, @ra may not be equal to &@filp->f_ra.
- *
- */
-unsigned long
-page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
-		     struct file *filp, pgoff_t offset, unsigned long req_size)
-{
-	unsigned long max, newsize;
-	int sequential;
-
-	/*
-	 * We avoid doing extra work and bogusly perturbing the readahead
-	 * window expansion logic.
-	 */
-	if (offset == ra->prev_index && --req_size)
-		++offset;
-
-	/* Note that prev_index == -1 if it is a first read */
-	sequential = (offset == ra->prev_index + 1);
-	ra->prev_index = offset;
-	ra->prev_offset = 0;
-
-	max = get_max_readahead(ra);
-	newsize = min(req_size, max);
-
-	/* No readahead or sub-page sized read or file already in cache */
-	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
-		goto out;
-
-	ra->prev_index += newsize - 1;
-
-	/*
-	 * Special case - first read at start of file. We'll assume it's
-	 * a whole-file read and grow the window fast.  Or detect first
-	 * sequential access
-	 */
-	if (sequential && ra->size == 0) {
-		ra->size = get_init_ra_size(newsize, max);
-		ra->start = offset;
-		if (!blockable_page_cache_readahead(mapping, filp, offset,
-							 ra->size, ra, 1))
-			goto out;
-
-		/*
-		 * If the request size is larger than our max readahead, we
-		 * at least want to be sure that we get 2 IOs in flight and
-		 * we know that we will definitly need the new I/O.
-		 * once we do this, subsequent calls should be able to overlap
-		 * IOs,* thus preventing stalls. so issue the ahead window
-		 * immediately.
-		 */
-		if (req_size >= max)
-			make_ahead_window(mapping, filp, ra, 1);
-
-		goto out;
-	}
-
-	/*
-	 * Now handle the random case:
-	 * partial page reads and first access were handled above,
-	 * so this must be the next page otherwise it is random
-	 */
-	if (!sequential) {
-		ra_off(ra);
-		blockable_page_cache_readahead(mapping, filp, offset,
-				 newsize, ra, 1);
-		goto out;
-	}
-
-	/*
-	 * If we get here we are doing sequential IO and this was not the first
-	 * occurence (ie we have an existing window)
-	 */
-	if (ra->ahead_start == 0) {	 /* no ahead window yet */
-		if (!make_ahead_window(mapping, filp, ra, 0))
-			goto recheck;
-	}
-
-	/*
-	 * Already have an ahead window, check if we crossed into it.
-	 * If so, shift windows and issue a new ahead window.
-	 * Only return the #pages that are in the current window, so that
-	 * we get called back on the first page of the ahead window which
-	 * will allow us to submit more IO.
-	 */
-	if (ra->prev_index >= ra->ahead_start) {
-		ra->start = ra->ahead_start;
-		ra->size = ra->ahead_size;
-		make_ahead_window(mapping, filp, ra, 0);
-recheck:
-		/* prev_index shouldn't overrun the ahead window */
-		ra->prev_index = min(ra->prev_index,
-			ra->ahead_start + ra->ahead_size - 1);
-	}
-
-out:
-	return ra->prev_index + 1;
-}
-EXPORT_SYMBOL_GPL(page_cache_readahead);
-
-/*
- * handle_ra_miss() is called when it is known that a page which should have
- * been present in the pagecache (we just did some readahead there) was in fact
- * not found.  This will happen if it was evicted by the VM (readahead
- * thrashing)
- *
- * Turn on the cache miss flag in the RA struct, this will cause the RA code
- * to reduce the RA size on the next read.
- */
-void handle_ra_miss(struct address_space *mapping,
-		struct file_ra_state *ra, pgoff_t offset)
-{
-	ra->flags |= RA_FLAG_MISS;
-	ra->flags &= ~RA_FLAG_INCACHE;
-	ra->cache_hit = 0;
-}
-
 /*
  * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
  * sensible upper limit.
@@ -612,20 +269,40 @@ unsigned long ra_submit(struct file_ra_state *ra,
 }
 EXPORT_SYMBOL_GPL(ra_submit);
 
+/*
+ * Set the initial window size, round to next power of 2 and square
+ * for small size, x 4 for medium, and x 2 for large
+ * for 128k (32 page) max ra
+ * 1-8 page = 32k initial, > 8 page = 128k initial
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+	unsigned long newsize = roundup_pow_of_two(size);
+
+	if (newsize <= max / 32)
+		newsize = newsize * 4;
+	else if (newsize <= max / 4)
+		newsize = newsize * 2;
+	else
+		newsize = max;
+
+	return newsize;
+}
+
 /*
  *  Get the previous window size, ramp it up, and
  *  return it as the new window size.
  */
-static unsigned long get_next_ra_size2(struct file_ra_state *ra,
+static unsigned long get_next_ra_size(struct file_ra_state *ra,
 						unsigned long max)
 {
 	unsigned long cur = ra->readahead_index - ra->ra_index;
 	unsigned long newsize;
 
 	if (cur < max / 16)
-		newsize = cur * 4;
+		newsize = 4 * cur;
 	else
-		newsize = cur * 2;
+		newsize = 2 * cur;
 
 	return min(newsize, max);
 }
@@ -701,7 +378,7 @@ ondemand_readahead(struct address_space *mapping,
 	if (offset && (offset == ra->lookahead_index ||
 			offset == ra->readahead_index)) {
 		ra_index = ra->readahead_index;
-		ra_size = get_next_ra_size2(ra, max);
+		ra_size = get_next_ra_size(ra, max);
 		la_size = ra_size;
 		goto fill_ra;
 	}
-- 
cgit v1.3-8-gc7d7


From fe3cba17c49471e99d3421e675fc8b3deaaf0b70 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Thu, 19 Jul 2007 01:48:07 -0700
Subject: mm: share PG_readahead and PG_reclaim

Share the same page flag bit for PG_readahead and PG_reclaim.

One is used only on file reads, another is only for emergency writes.  One
is used mostly for fresh/young pages, another is for old pages.

Combinations of possible interactions are:

a) clear PG_reclaim => implicit clear of PG_readahead
	it will delay an asynchronous readahead into a synchronous one
	it actually does _good_ for readahead:
		the pages will be reclaimed soon, it's readahead thrashing!
		in this case, synchronous readahead makes more sense.

b) clear PG_readahead => implicit clear of PG_reclaim
	one(and only one) page will not be reclaimed in time
	it can be avoided by checking PageWriteback(page) in readahead first

c) set PG_reclaim => implicit set of PG_readahead
	will confuse readahead and make it restart the size rampup process
	it's a trivial problem, and can mostly be avoided by checking
	PageWriteback(page) first in readahead

d) set PG_readahead => implicit set of PG_reclaim
	PG_readahead will never be set on already cached pages.
	PG_reclaim will always be cleared on dirtying a page.
	so not a problem.

In summary,
	a)   we get better behavior
	b,d) possible interactions can be avoided
	c)   racy condition exists that might affect readahead, but the chance
	     is _really_ low, and the hurt on readahead is trivial.

Compound pages also use PG_reclaim, but for now they do not interact with
reclaim/readahead code.

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 4 +++-
 mm/page-writeback.c        | 1 +
 mm/page_alloc.c            | 7 -------
 mm/readahead.c             | 6 ++++++
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 709d92fd2877..a454176c3e30 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -83,7 +83,6 @@
 #define PG_private		11	/* If pagecache, has fs-private data */
 
 #define PG_writeback		12	/* Page is under writeback */
-#define PG_readahead		13	/* Reminder to do async read-ahead */
 #define PG_compound		14	/* Part of a compound page */
 #define PG_swapcache		15	/* Swap page: swp_entry_t in private */
 
@@ -91,6 +90,9 @@
 #define PG_reclaim		17	/* To be reclaimed asap */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
+/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
+#define PG_readahead		PG_reclaim /* Reminder to do async read-ahead */
+
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
 #define PG_pinned		PG_owner_priv_1	/* Xen pinned pagetable */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e62482718012..51b3eb6ab445 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -920,6 +920,7 @@ int clear_page_dirty_for_io(struct page *page)
 
 	BUG_ON(!PageLocked(page));
 
+	ClearPageReclaim(page);
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		/*
 		 * Yes, Virginia, this is indeed insane.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2165be9462c0..43cb3b3e1679 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -453,12 +453,6 @@ static inline int free_pages_check(struct page *page)
 			1 << PG_reserved |
 			1 << PG_buddy ))))
 		bad_page(page);
-	/*
-	 * PageReclaim == PageTail. It is only an error
-	 * for PageReclaim to be set if PageCompound is clear.
-	 */
-	if (unlikely(!PageCompound(page) && PageReclaim(page)))
-		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
 	/*
@@ -602,7 +596,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 			1 << PG_locked	|
 			1 << PG_active	|
 			1 << PG_dirty	|
-			1 << PG_reclaim	|
 			1 << PG_slab    |
 			1 << PG_swapcache |
 			1 << PG_writeback |
diff --git a/mm/readahead.c b/mm/readahead.c
index 5b3c9b7d70fa..205a4a431516 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -448,6 +448,12 @@ page_cache_readahead_ondemand(struct address_space *mapping,
 		return 0;
 
 	if (page) {
+		/*
+		 * It can be PG_reclaim.
+		 */
+		if (PageWriteback(page))
+			return 0;
+
 		ClearPageReadahead(page);
 
 		/*
-- 
cgit v1.3-8-gc7d7


From cf914a7d656e62b9dd3e0dffe4f62b953ae6048d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 19 Jul 2007 01:48:08 -0700
Subject: readahead: split ondemand readahead interface into two functions

Split ondemand readahead interface into two functions.  I think this makes it
a little clearer for non-readahead experts (like Rusty).

Internally they both call ondemand_readahead(), but the page argument is
changed to an obvious boolean flag.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/dir.c      |  4 +--
 fs/ext4/dir.c      |  4 +--
 fs/splice.c        |  6 ++--
 include/linux/mm.h | 20 +++++++----
 mm/filemap.c       | 10 +++---
 mm/readahead.c     | 97 +++++++++++++++++++++++++++++++++---------------------
 6 files changed, 85 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3c6d384a2c66..c00723a99f44 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -139,10 +139,10 @@ static int ext3_readdir(struct file * filp,
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
 			if (!ra_has_index(&filp->f_ra, index))
-				page_cache_readahead_ondemand(
+				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
 					&filp->f_ra, filp,
-					NULL, index, 1);
+					index, 1);
 			filp->f_ra.prev_index = index;
 			bh = ext3_bread(NULL, inode, blk, 0, &err);
 		}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 0a872a09fed8..3ab01c04e00c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -138,10 +138,10 @@ static int ext4_readdir(struct file * filp,
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
 			if (!ra_has_index(&filp->f_ra, index))
-				page_cache_readahead_ondemand(
+				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
 					&filp->f_ra, filp,
-					NULL, index, 1);
+					index, 1);
 			filp->f_ra.prev_index = index;
 			bh = ext4_bread(NULL, inode, blk, 0, &err);
 		}
diff --git a/fs/splice.c b/fs/splice.c
index 6ddd0329f866..22496d2a73fa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -295,8 +295,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * readahead/allocate the rest and fill in the holes.
 	 */
 	if (spd.nr_pages < nr_pages)
-		page_cache_readahead_ondemand(mapping, &in->f_ra, in,
-				NULL, index, req_pages - spd.nr_pages);
+		page_cache_sync_readahead(mapping, &in->f_ra, in,
+				index, req_pages - spd.nr_pages);
 
 	error = 0;
 	while (spd.nr_pages < nr_pages) {
@@ -352,7 +352,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		page = pages[page_nr];
 
 		if (PageReadahead(page))
-			page_cache_readahead_ondemand(mapping, &in->f_ra, in,
+			page_cache_async_readahead(mapping, &in->f_ra, in,
 					page, index, req_pages - page_nr);
 
 		/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3d0d7d285237..50a0ed1d1806 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1138,12 +1138,20 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			pgoff_t offset, unsigned long nr_to_read);
-unsigned long page_cache_readahead_ondemand(struct address_space *mapping,
-			  struct file_ra_state *ra,
-			  struct file *filp,
-			  struct page *page,
-			  pgoff_t offset,
-			  unsigned long size);
+
+void page_cache_sync_readahead(struct address_space *mapping,
+			       struct file_ra_state *ra,
+			       struct file *filp,
+			       pgoff_t offset,
+			       unsigned long size);
+
+void page_cache_async_readahead(struct address_space *mapping,
+				struct file_ra_state *ra,
+				struct file *filp,
+				struct page *pg,
+				pgoff_t offset,
+				unsigned long size);
+
 unsigned long max_sane_readahead(unsigned long nr);
 
 /* Do stack extension */
diff --git a/mm/filemap.c b/mm/filemap.c
index 5eb0a6b9d607..49a6fe375d01 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -894,15 +894,15 @@ void do_generic_mapping_read(struct address_space *mapping,
 find_page:
 		page = find_get_page(mapping, index);
 		if (!page) {
-			page_cache_readahead_ondemand(mapping,
-					&ra, filp, page,
+			page_cache_sync_readahead(mapping,
+					&ra, filp,
 					index, last_index - index);
 			page = find_get_page(mapping, index);
 			if (unlikely(page == NULL))
 				goto no_cached_page;
 		}
 		if (PageReadahead(page)) {
-			page_cache_readahead_ondemand(mapping,
+			page_cache_async_readahead(mapping,
 					&ra, filp, page,
 					index, last_index - index);
 		}
@@ -1348,14 +1348,14 @@ retry_find:
 	 */
 	if (VM_SequentialReadHint(vma)) {
 		if (!page) {
-			page_cache_readahead_ondemand(mapping, ra, file, page,
+			page_cache_sync_readahead(mapping, ra, file,
 							   vmf->pgoff, 1);
 			page = find_lock_page(mapping, vmf->pgoff);
 			if (!page)
 				goto no_cached_page;
 		}
 		if (PageReadahead(page)) {
-			page_cache_readahead_ondemand(mapping, ra, file, page,
+			page_cache_async_readahead(mapping, ra, file, page,
 							   vmf->pgoff, 1);
 		}
 	}
diff --git a/mm/readahead.c b/mm/readahead.c
index 205a4a431516..3d262bb738a9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -359,7 +359,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
 static unsigned long
 ondemand_readahead(struct address_space *mapping,
 		   struct file_ra_state *ra, struct file *filp,
-		   struct page *page, pgoff_t offset,
+		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
 	unsigned long max;	/* max readahead pages */
@@ -387,7 +387,7 @@ ondemand_readahead(struct address_space *mapping,
 	 * Standalone, small read.
 	 * Read as is, and do not pollute the readahead state.
 	 */
-	if (!page && !sequential) {
+	if (!hit_readahead_marker && !sequential) {
 		return __do_page_cache_readahead(mapping, filp,
 						offset, req_size, 0);
 	}
@@ -408,7 +408,7 @@ ondemand_readahead(struct address_space *mapping,
 	 * E.g. interleaved reads.
 	 * Not knowing its readahead pos/size, bet on the minimal possible one.
 	 */
-	if (page) {
+	if (hit_readahead_marker) {
 		ra_index++;
 		ra_size = min(4 * ra_size, max);
 	}
@@ -421,50 +421,71 @@ fill_ra:
 }
 
 /**
- * page_cache_readahead_ondemand - generic file readahead
+ * page_cache_sync_readahead - generic file readahead
  * @mapping: address_space which holds the pagecache and I/O vectors
  * @ra: file_ra_state which holds the readahead state
  * @filp: passed on to ->readpage() and ->readpages()
- * @page: the page at @offset, or NULL if non-present
- * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
+ * @offset: start offset into @mapping, in pagecache page-sized units
  * @req_size: hint: total size of the read which the caller is performing in
- *            PAGE_CACHE_SIZE units
+ *            pagecache pages
  *
- * page_cache_readahead_ondemand() is the entry point of readahead logic.
- * This function should be called when it is time to perform readahead:
- * 1) @page == NULL
- *    A cache miss happened, time for synchronous readahead.
- * 2) @page != NULL && PageReadahead(@page)
- *    A look-ahead hit occured, time for asynchronous readahead.
+ * page_cache_sync_readahead() should be called when a cache miss happened:
+ * it will submit the read.  The readahead logic may decide to piggyback more
+ * pages onto the read request if access patterns suggest it will improve
+ * performance.
  */
-unsigned long
-page_cache_readahead_ondemand(struct address_space *mapping,
-				struct file_ra_state *ra, struct file *filp,
-				struct page *page, pgoff_t offset,
-				unsigned long req_size)
+void page_cache_sync_readahead(struct address_space *mapping,
+			       struct file_ra_state *ra, struct file *filp,
+			       pgoff_t offset, unsigned long req_size)
 {
 	/* no read-ahead */
 	if (!ra->ra_pages)
-		return 0;
-
-	if (page) {
-		/*
-		 * It can be PG_reclaim.
-		 */
-		if (PageWriteback(page))
-			return 0;
-
-		ClearPageReadahead(page);
-
-		/*
-		 * Defer asynchronous read-ahead on IO congestion.
-		 */
-		if (bdi_read_congested(mapping->backing_dev_info))
-			return 0;
-	}
+		return;
+
+	/* do read-ahead */
+	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+
+/**
+ * page_cache_async_readahead - file readahead for marked pages
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @page: the page at @offset which has the PG_readahead flag set
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ *            pagecache pages
+ *
+ * page_cache_async_ondemand() should be called when a page is used which
+ * has the PG_readahead flag: this is a marker to suggest that the application
+ * has used up enough of the readahead window that we should start pulling in
+ * more pages. */
+void
+page_cache_async_readahead(struct address_space *mapping,
+			   struct file_ra_state *ra, struct file *filp,
+			   struct page *page, pgoff_t offset,
+			   unsigned long req_size)
+{
+	/* no read-ahead */
+	if (!ra->ra_pages)
+		return;
+
+	/*
+	 * Same bit is used for PG_readahead and PG_reclaim.
+	 */
+	if (PageWriteback(page))
+		return;
+
+	ClearPageReadahead(page);
+
+	/*
+	 * Defer asynchronous read-ahead on IO congestion.
+	 */
+	if (bdi_read_congested(mapping->backing_dev_info))
+		return;
 
 	/* do read-ahead */
-	return ondemand_readahead(mapping, ra, filp, page,
-					offset, req_size);
+	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 }
-EXPORT_SYMBOL_GPL(page_cache_readahead_ondemand);
+EXPORT_SYMBOL_GPL(page_cache_async_readahead);
-- 
cgit v1.3-8-gc7d7


From f9acc8c7b35a100f3a9e0e6977f7807b0169f9a5 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Thu, 19 Jul 2007 01:48:08 -0700
Subject: readahead: sanify file_ra_state names

Rename some file_ra_state variables and remove some accessors.

It results in much simpler code.
Kudos to Rusty!

Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 61 +++++-------------------------------------------
 mm/readahead.c     | 68 ++++++++++++++++++++----------------------------------
 2 files changed, 31 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29cb32d3a849..d33beadd9a43 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -695,16 +695,12 @@ struct fown_struct {
 
 /*
  * Track a single file's readahead state
- *
- *  ================#============|==================#==================|
- *                  ^            ^                  ^                  ^
- *  file_ra_state.la_index    .ra_index   .lookahead_index   .readahead_index
  */
 struct file_ra_state {
-	pgoff_t la_index;               /* enqueue time */
-	pgoff_t ra_index;               /* begin offset */
-	pgoff_t lookahead_index;        /* time to do next readahead */
-	pgoff_t readahead_index;        /* end offset */
+	pgoff_t start;                  /* where readahead started */
+	unsigned long size;             /* # of readahead pages */
+	unsigned long async_size;       /* do asynchronous readahead when
+					   there are only # of pages ahead */
 
 	unsigned long ra_pages;		/* Maximum readahead window */
 	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
@@ -713,60 +709,15 @@ struct file_ra_state {
 	unsigned int prev_offset;	/* Offset where last read() ended in a page */
 };
 
-/*
- * Measuring read-ahead sizes.
- *
- *                  |----------- readahead size ------------>|
- *  ===#============|==================#=====================|
- *     |------- invoke interval ------>|-- lookahead size -->|
- */
-static inline unsigned long ra_readahead_size(struct file_ra_state *ra)
-{
-	return ra->readahead_index - ra->ra_index;
-}
-
-static inline unsigned long ra_lookahead_size(struct file_ra_state *ra)
-{
-	return ra->readahead_index - ra->lookahead_index;
-}
-
-static inline unsigned long ra_invoke_interval(struct file_ra_state *ra)
-{
-	return ra->lookahead_index - ra->la_index;
-}
-
 /*
  * Check if @index falls in the readahead windows.
  */
 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 {
-	return (index >= ra->la_index &&
-		index <  ra->readahead_index);
-}
-
-/*
- * Where is the old read-ahead and look-ahead?
- */
-static inline void ra_set_index(struct file_ra_state *ra,
-				pgoff_t la_index, pgoff_t ra_index)
-{
-	ra->la_index = la_index;
-	ra->ra_index = ra_index;
+	return (index >= ra->start &&
+		index <  ra->start + ra->size);
 }
 
-/*
- * Where is the new read-ahead and look-ahead?
- */
-static inline void ra_set_size(struct file_ra_state *ra,
-				unsigned long ra_size, unsigned long la_size)
-{
-	ra->readahead_index = ra->ra_index + ra_size;
-	ra->lookahead_index = ra->ra_index + ra_size - la_size;
-}
-
-unsigned long ra_submit(struct file_ra_state *ra,
-		       struct address_space *mapping, struct file *filp);
-
 struct file {
 	/*
 	 * fu_list becomes invalid after file_free is called and queued via
diff --git a/mm/readahead.c b/mm/readahead.c
index 3d262bb738a9..39bf45d43320 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -253,21 +253,16 @@ unsigned long max_sane_readahead(unsigned long nr)
 /*
  * Submit IO for the read-ahead request in file_ra_state.
  */
-unsigned long ra_submit(struct file_ra_state *ra,
+static unsigned long ra_submit(struct file_ra_state *ra,
 		       struct address_space *mapping, struct file *filp)
 {
-	unsigned long ra_size;
-	unsigned long la_size;
 	int actual;
 
-	ra_size = ra_readahead_size(ra);
-	la_size = ra_lookahead_size(ra);
 	actual = __do_page_cache_readahead(mapping, filp,
-					ra->ra_index, ra_size, la_size);
+					ra->start, ra->size, ra->async_size);
 
 	return actual;
 }
-EXPORT_SYMBOL_GPL(ra_submit);
 
 /*
  * Set the initial window size, round to next power of 2 and square
@@ -296,7 +291,7 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
 static unsigned long get_next_ra_size(struct file_ra_state *ra,
 						unsigned long max)
 {
-	unsigned long cur = ra->readahead_index - ra->ra_index;
+	unsigned long cur = ra->size;
 	unsigned long newsize;
 
 	if (cur < max / 16)
@@ -313,28 +308,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
  * The fields in struct file_ra_state represent the most-recently-executed
  * readahead attempt:
  *
- *                    |-------- last readahead window -------->|
- *       |-- application walking here -->|
- * ======#============|==================#=====================|
- *       ^la_index    ^ra_index          ^lookahead_index      ^readahead_index
- *
- * [ra_index, readahead_index) represents the last readahead window.
- *
- * [la_index, lookahead_index] is where the application would be walking(in
- * the common case of cache-cold sequential reads): the last window was
- * established when the application was at la_index, and the next window will
- * be bring in when the application reaches lookahead_index.
+ *                        |<----- async_size ---------|
+ *     |------------------- size -------------------->|
+ *     |==================#===========================|
+ *     ^start             ^page marked with PG_readahead
  *
  * To overlap application thinking time and disk I/O time, we do
  * `readahead pipelining': Do not wait until the application consumed all
  * readahead pages and stalled on the missing page at readahead_index;
- * Instead, submit an asynchronous readahead I/O as early as the application
- * reads on the page at lookahead_index. Normally lookahead_index will be
- * equal to ra_index, for maximum pipelining.
+ * Instead, submit an asynchronous readahead I/O as soon as there are
+ * only async_size pages left in the readahead window. Normally async_size
+ * will be equal to size, for maximum pipelining.
  *
  * In interleaved sequential reads, concurrent streams on the same fd can
  * be invalidating each other's readahead state. So we flag the new readahead
- * page at lookahead_index with PG_readahead, and use it as readahead
+ * page at (start+size-async_size) with PG_readahead, and use it as readahead
  * indicator. The flag won't be set on already cached pages, to avoid the
  * readahead-for-nothing fuss, saving pointless page cache lookups.
  *
@@ -363,24 +351,21 @@ ondemand_readahead(struct address_space *mapping,
 		   unsigned long req_size)
 {
 	unsigned long max;	/* max readahead pages */
-	pgoff_t ra_index;	/* readahead index */
-	unsigned long ra_size;	/* readahead size */
-	unsigned long la_size;	/* lookahead size */
 	int sequential;
 
 	max = ra->ra_pages;
 	sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
 
 	/*
-	 * Lookahead/readahead hit, assume sequential access.
+	 * It's the expected callback offset, assume sequential access.
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
-	if (offset && (offset == ra->lookahead_index ||
-			offset == ra->readahead_index)) {
-		ra_index = ra->readahead_index;
-		ra_size = get_next_ra_size(ra, max);
-		la_size = ra_size;
-		goto fill_ra;
+	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
+			offset == (ra->start + ra->size))) {
+		ra->start += ra->size;
+		ra->size = get_next_ra_size(ra, max);
+		ra->async_size = ra->size;
+		goto readit;
 	}
 
 	/*
@@ -399,24 +384,21 @@ ondemand_readahead(struct address_space *mapping,
 	 * 	- oversize random read
 	 * Start readahead for it.
 	 */
-	ra_index = offset;
-	ra_size = get_init_ra_size(req_size, max);
-	la_size = ra_size > req_size ? ra_size - req_size : ra_size;
+	ra->start = offset;
+	ra->size = get_init_ra_size(req_size, max);
+	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
 	/*
-	 * Hit on a lookahead page without valid readahead state.
+	 * Hit on a marked page without valid readahead state.
 	 * E.g. interleaved reads.
 	 * Not knowing its readahead pos/size, bet on the minimal possible one.
 	 */
 	if (hit_readahead_marker) {
-		ra_index++;
-		ra_size = min(4 * ra_size, max);
+		ra->start++;
+		ra->size = get_next_ra_size(ra, max);
 	}
 
-fill_ra:
-	ra_set_index(ra, offset, ra_index);
-	ra_set_size(ra, ra_size, la_size);
-
+readit:
 	return ra_submit(ra, mapping, filp);
 }
 
-- 
cgit v1.3-8-gc7d7


From 81eae375eceba481ca4c605d42913871f093f6d5 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Thu, 19 Jul 2007 01:48:09 -0700
Subject: jprobes: make struct jprobe.entry a void *

Currently jprobe.entry is a kprobe_opcode_t *, but that's a lie.  On some
platforms it doesn't point to an opcode at all, it points to a function
descriptor.

It's really a pointer to something that the arch code can turn into a function
entry point.  And that's what actually happens, none of the generic code ever
looks at jprobe.entry, it's only ever dereferenced by arch code.

So just make it a void *.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kprobes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 23adf6075ae4..f4e53b71d23f 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -116,7 +116,7 @@ struct kprobe {
  */
 struct jprobe {
 	struct kprobe kp;
-	kprobe_opcode_t *entry;	/* probe handling code to jump to */
+	void *entry;	/* probe handling code to jump to */
 };
 
 DECLARE_PER_CPU(struct kprobe *, current_kprobe);
-- 
cgit v1.3-8-gc7d7


From 9e367d859297b9377d65574f538cf52730e9eda8 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Thu, 19 Jul 2007 01:48:10 -0700
Subject: jprobes: remove JPROBE_ENTRY()

AFAICT now that jprobe.entry is a void *, JPROBE_ENTRY doesn't do anything
useful - so remove it ..

I've left a do-nothing version so that out-of-tree jprobes code will still
compile without modifications.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kprobes.txt     | 8 +-------
 include/asm-i386/kprobes.h    | 1 -
 include/asm-ia64/kprobes.h    | 2 --
 include/asm-powerpc/kprobes.h | 2 --
 include/asm-s390/kprobes.h    | 2 --
 include/asm-sparc64/kprobes.h | 1 -
 include/asm-x86_64/kprobes.h  | 1 -
 include/linux/kprobes.h       | 3 +++
 net/dccp/probe.c              | 2 +-
 net/ipv4/tcp_probe.c          | 2 +-
 10 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index da5404ab7569..cb12ae175aa2 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -247,12 +247,6 @@ control to Kprobes.)  If the probed function is declared asmlinkage,
 fastcall, or anything else that affects how args are passed, the
 handler's declaration must match.
 
-NOTE: A macro JPROBE_ENTRY is provided to handle architecture-specific
-aliasing of jp->entry. In the interest of portability, it is advised
-to use:
-
-	jp->entry = JPROBE_ENTRY(handler);
-
 register_jprobe() returns 0 on success, or a negative errno otherwise.
 
 4.3 register_kretprobe
@@ -518,7 +512,7 @@ long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
 }
 
 static struct jprobe my_jprobe = {
-	.entry = JPROBE_ENTRY(jdo_fork)
+	.entry = jdo_fork
 };
 
 static int __init jprobe_init(void)
diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h
index 8774d06689da..06f7303c30ca 100644
--- a/include/asm-i386/kprobes.h
+++ b/include/asm-i386/kprobes.h
@@ -42,7 +42,6 @@ typedef u8 kprobe_opcode_t;
 	? (MAX_STACK_SIZE) \
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
-#define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
 #define ARCH_SUPPORTS_KRETPROBES
 #define  ARCH_INACTIVE_KPROBE_COUNT 0
 #define flush_insn_slot(p)	do { } while (0)
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index 6382e52ec227..067d9dea68f9 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -82,8 +82,6 @@ struct kprobe_ctlblk {
 	struct prev_kprobe prev_kprobe[ARCH_PREV_KPROBE_SZ];
 };
 
-#define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
-
 #define ARCH_SUPPORTS_KRETPROBES
 #define  ARCH_INACTIVE_KPROBE_COUNT 1
 
diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h
index 9537fda238b8..8b08b447d6f3 100644
--- a/include/asm-powerpc/kprobes.h
+++ b/include/asm-powerpc/kprobes.h
@@ -73,12 +73,10 @@ typedef unsigned int kprobe_opcode_t;
 	}								\
 }
 
-#define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)((func_descr_t *)pentry)
 #define is_trap(instr)	(IS_TW(instr) || IS_TD(instr) || \
 			IS_TWI(instr) || IS_TDI(instr))
 #else
 /* Use stock kprobe_lookup_name since ppc32 doesn't use function descriptors */
-#define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)(pentry)
 #define is_trap(instr)	(IS_TW(instr) || IS_TWI(instr))
 #endif
 
diff --git a/include/asm-s390/kprobes.h b/include/asm-s390/kprobes.h
index 830fe4c4eea6..340ba10446ea 100644
--- a/include/asm-s390/kprobes.h
+++ b/include/asm-s390/kprobes.h
@@ -46,8 +46,6 @@ typedef u16 kprobe_opcode_t;
 	? (MAX_STACK_SIZE) \
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
-#define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)(pentry)
-
 #define ARCH_SUPPORTS_KRETPROBES
 #define ARCH_INACTIVE_KPROBE_COUNT 0
 
diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h
index a331b7b0dff2..7f6774dca5f4 100644
--- a/include/asm-sparc64/kprobes.h
+++ b/include/asm-sparc64/kprobes.h
@@ -10,7 +10,6 @@ typedef u32 kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION_2 0x91d02071 /* ta 0x71 */
 #define MAX_INSN_SIZE 2
 
-#define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
 #define arch_remove_kprobe(p)	do {} while (0)
 #define  ARCH_INACTIVE_KPROBE_COUNT 0
 
diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h
index cf5317898fb0..7db825403e01 100644
--- a/include/asm-x86_64/kprobes.h
+++ b/include/asm-x86_64/kprobes.h
@@ -41,7 +41,6 @@ typedef u8 kprobe_opcode_t;
 	? (MAX_STACK_SIZE) \
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
-#define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
 #define ARCH_SUPPORTS_KRETPROBES
 #define  ARCH_INACTIVE_KPROBE_COUNT 1
 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index f4e53b71d23f..bd892850c94a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -119,6 +119,9 @@ struct jprobe {
 	void *entry;	/* probe handling code to jump to */
 };
 
+/* For backward compatibility with old code using JPROBE_ENTRY() */
+#define JPROBE_ENTRY(handler)	(handler)
+
 DECLARE_PER_CPU(struct kprobe *, current_kprobe);
 DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 43a3adb027e7..bae10b0f2fc3 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -112,7 +112,7 @@ static struct jprobe dccp_send_probe = {
 	.kp	= {
 		.symbol_name = "dccp_sendmsg",
 	},
-	.entry	= JPROBE_ENTRY(jdccp_sendmsg),
+	.entry	= jdccp_sendmsg,
 };
 
 static int dccpprobe_open(struct inode *inode, struct file *file)
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f37d5928921a..b76398d1b454 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -130,7 +130,7 @@ static struct jprobe tcp_jprobe = {
 	.kp = {
 		.symbol_name	= "tcp_rcv_established",
 	},
-	.entry	= JPROBE_ENTRY(jtcp_rcv_established),
+	.entry	= jtcp_rcv_established,
 };
 
 static int tcpprobe_open(struct inode * inode, struct file * file)
-- 
cgit v1.3-8-gc7d7


From 3d7e33825d8799115dd2495c9944badd3272a623 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Thu, 19 Jul 2007 01:48:11 -0700
Subject: jprobes: make jprobes a little safer for users

I realise jprobes are a razor-blades-included type of interface, but that
doesn't mean we can't try and make them safer to use.  This guy I know once
wrote code like this:

struct jprobe jp = { .kp.symbol_name = "foo", .entry = "jprobe_foo" };

And then his kernel exploded. Oops.

This patch adds an arch hook, arch_deref_entry_point() (I don't like it
either) which takes the void * in a struct jprobe, and gives back the text
address that it represents.

We can then use that in register_jprobe() to check that the entry point we're
passed is actually in the kernel text, rather than just some random value.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/kprobes.c    |  7 ++++++-
 arch/powerpc/kernel/kprobes.c | 11 ++++++++---
 include/linux/kprobes.h       |  1 +
 kernel/kprobes.c              |  9 +++++++++
 4 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 5bc46f151344..5dc98b5abcfb 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -936,10 +936,15 @@ static void ia64_get_bsp_cfm(struct unw_frame_info *info, void *arg)
 	return;
 }
 
+unsigned long arch_deref_entry_point(void *entry)
+{
+	return ((struct fnptr *)entry)->ip;
+}
+
 int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
-	unsigned long addr = ((struct fnptr *)(jp->entry))->ip;
+	unsigned long addr = arch_deref_entry_point(jp->entry);
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 	struct param_bsp_cfm pa;
 	int bytes;
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 0c96611f02f4..440f5a87271f 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -492,6 +492,13 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	return ret;
 }
 
+#ifdef CONFIG_PPC64
+unsigned long arch_deref_entry_point(void *entry)
+{
+	return (unsigned long)(((func_descr_t *)entry)->entry);
+}
+#endif
+
 int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
@@ -500,11 +507,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs));
 
 	/* setup return addr to the jprobe handler routine */
+	regs->nip = arch_deref_entry_point(jp->entry);
 #ifdef CONFIG_PPC64
-	regs->nip = (unsigned long)(((func_descr_t *)jp->entry)->entry);
 	regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc);
-#else
-	regs->nip = (unsigned long)jp->entry;
 #endif
 
 	return 1;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bd892850c94a..51464d12a4e5 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -214,6 +214,7 @@ int longjmp_break_handler(struct kprobe *, struct pt_regs *);
 int register_jprobe(struct jprobe *p);
 void unregister_jprobe(struct jprobe *p);
 void jprobe_return(void);
+unsigned long arch_deref_entry_point(void *);
 
 int register_kretprobe(struct kretprobe *rp);
 void unregister_kretprobe(struct kretprobe *rp);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9e47d8c493f3..3e9f513a728d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = {
 	.priority = 0x7fffffff /* we need to be notified first */
 };
 
+unsigned long __weak arch_deref_entry_point(void *entry)
+{
+	return (unsigned long)entry;
+}
 
 int __kprobes register_jprobe(struct jprobe *jp)
 {
+	unsigned long addr = arch_deref_entry_point(jp->entry);
+
+	if (!kernel_text_address(addr))
+		return -EINVAL;
+
 	/* Todo: Verify probepoint is a function entry point */
 	jp->kp.pre_handler = setjmp_pre_handler;
 	jp->kp.break_handler = longjmp_break_handler;
-- 
cgit v1.3-8-gc7d7


From bdf4c48af20a3b0f01671799ace345e3d49576da Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jul 2007 01:48:15 -0700
Subject: audit: rework execve audit

The purpose of audit_bprm() is to log the argv array to a userspace daemon at
the end of the execve system call.  Since user-space hasn't had time to run,
this array is still in pristine state on the process' stack; so no need to
copy it, we can just grab it from there.

In order to minimize the damage to audit_log_*() copy each string into a
temporary kernel buffer first.

Currently the audit code requires that the full argument vector fits in a
single packet.  So currently it does clip the argv size to a (sysctl) limit,
but only when execve auditing is enabled.

If the audit protocol gets extended to allow for multiple packets this check
can be removed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ollie Wild <aaw@google.com>
Cc: <linux-audit@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  7 ++++
 fs/exec.c                          |  3 ++
 include/linux/binfmts.h            |  1 +
 kernel/auditsc.c                   | 84 ++++++++++++++++++++++++++++----------
 kernel/sysctl.c                    | 11 +++++
 5 files changed, 85 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ebffdffb3d99..72e247ef6fa2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1065,6 +1065,13 @@ check the amount of free space (value is in seconds). Default settings are: 4,
 resume it  if we have a value of 3 or more percent; consider information about
 the amount of free space valid for 30 seconds
 
+audit_argv_kb
+-------------
+
+The file contains a single value denoting the limit on the argv array size
+for execve (in KiB). This limit is only applied when system call auditing for
+execve is enabled, otherwise the value is ignored.
+
 ctrl-alt-del
 ------------
 
diff --git a/fs/exec.c b/fs/exec.c
index f20561ff4528..2e3f7950c185 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1154,6 +1154,7 @@ int do_execve(char * filename,
 {
 	struct linux_binprm *bprm;
 	struct file *file;
+	unsigned long env_p;
 	int retval;
 	int i;
 
@@ -1208,9 +1209,11 @@ int do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
+	env_p = bprm->p;
 	retval = copy_strings(bprm->argc, argv, bprm);
 	if (retval < 0)
 		goto out;
+	bprm->argv_len = env_p - bprm->p;
 
 	retval = search_binary_handler(bprm,regs);
 	if (retval >= 0) {
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index e1a708337be3..a0b209cd5761 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -40,6 +40,7 @@ struct linux_binprm{
 	unsigned interp_flags;
 	unsigned interp_data;
 	unsigned long loader, exec;
+	unsigned long argv_len;
 };
 
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b7640a5f382a..535586fc498b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -153,7 +153,7 @@ struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
 	int envc;
-	char mem[0];
+	struct mm_struct *mm;
 };
 
 struct audit_aux_data_socketcall {
@@ -831,6 +831,55 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 	return rc;
 }
 
+static void audit_log_execve_info(struct audit_buffer *ab,
+		struct audit_aux_data_execve *axi)
+{
+	int i;
+	long len, ret;
+	const char __user *p = (const char __user *)axi->mm->arg_start;
+	char *buf;
+
+	if (axi->mm != current->mm)
+		return; /* execve failed, no additional info */
+
+	for (i = 0; i < axi->argc; i++, p += len) {
+		len = strnlen_user(p, MAX_ARG_PAGES*PAGE_SIZE);
+		/*
+		 * We just created this mm, if we can't find the strings
+		 * we just copied into it something is _very_ wrong. Similar
+		 * for strings that are too long, we should not have created
+		 * any.
+		 */
+		if (!len || len > MAX_ARG_STRLEN) {
+			WARN_ON(1);
+			send_sig(SIGKILL, current, 0);
+		}
+
+		buf = kmalloc(len, GFP_KERNEL);
+		if (!buf) {
+			audit_panic("out of memory for argv string\n");
+			break;
+		}
+
+		ret = copy_from_user(buf, p, len);
+		/*
+		 * There is no reason for this copy to be short. We just
+		 * copied them here, and the mm hasn't been exposed to user-
+		 * space yet.
+		 */
+		if (!ret) {
+			WARN_ON(1);
+			send_sig(SIGKILL, current, 0);
+		}
+
+		audit_log_format(ab, "a%d=", i);
+		audit_log_untrustedstring(ab, buf);
+		audit_log_format(ab, "\n");
+
+		kfree(buf);
+	}
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
 	int i, call_panic = 0;
@@ -971,13 +1020,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
-			int i;
-			const char *p;
-			for (i = 0, p = axi->mem; i < axi->argc; i++) {
-				audit_log_format(ab, "a%d=", i);
-				p = audit_log_untrustedstring(ab, p);
-				audit_log_format(ab, "\n");
-			}
+			audit_log_execve_info(ab, axi);
 			break; }
 
 		case AUDIT_SOCKETCALL: {
@@ -1821,32 +1864,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode
 	return 0;
 }
 
+int audit_argv_kb = 32;
+
 int audit_bprm(struct linux_binprm *bprm)
 {
 	struct audit_aux_data_execve *ax;
 	struct audit_context *context = current->audit_context;
-	unsigned long p, next;
-	void *to;
 
 	if (likely(!audit_enabled || !context || context->dummy))
 		return 0;
 
-	ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
-				GFP_KERNEL);
+	/*
+	 * Even though the stack code doesn't limit the arg+env size any more,
+	 * the audit code requires that _all_ arguments be logged in a single
+	 * netlink skb. Hence cap it :-(
+	 */
+	if (bprm->argv_len > (audit_argv_kb << 10))
+		return -E2BIG;
+
+	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
 		return -ENOMEM;
 
 	ax->argc = bprm->argc;
 	ax->envc = bprm->envc;
-	for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
-		struct page *page = bprm->page[p / PAGE_SIZE];
-		void *kaddr = kmap(page);
-		next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
-		memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
-		to += next - p;
-		kunmap(page);
-	}
-
+	ax->mm = bprm->mm;
 	ax->d.type = AUDIT_EXECVE;
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3ed4912bf183..8db41764e2a1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -78,6 +78,7 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
+extern int audit_argv_kb;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -306,6 +307,16 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_AUDITSYSCALL
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "audit_argv_kb",
+		.data		= &audit_argv_kb,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= KERN_CORE_PATTERN,
 		.procname	= "core_pattern",
-- 
cgit v1.3-8-gc7d7


From b6a2fea39318e43fee84fa7b0b90d68bed92d2ba Mon Sep 17 00:00:00 2001
From: Ollie Wild <aaw@google.com>
Date: Thu, 19 Jul 2007 01:48:16 -0700
Subject: mm: variable length argument support

Remove the arg+env limit of MAX_ARG_PAGES by copying the strings directly from
the old mm into the new mm.

We create the new mm before the binfmt code runs, and place the new stack at
the very top of the address space.  Once the binfmt code runs and figures out
where the stack should be, we move it downwards.

It is a bit peculiar in that we have one task with two mm's, one of which is
inactive.

[a.p.zijlstra@chello.nl: limit stack size]
Signed-off-by: Ollie Wild <aaw@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <linux-arch@vger.kernel.org>
Cc: Hugh Dickins <hugh@veritas.com>
[bunk@stusta.de: unexport bprm_mm_init]
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/ia32/binfmt_elf32.c  |  67 ++---
 arch/x86_64/ia32/ia32_aout.c   |   2 +-
 arch/x86_64/ia32/ia32_binfmt.c |  58 ----
 fs/binfmt_elf.c                |  28 +-
 fs/binfmt_elf_fdpic.c          |   8 +-
 fs/binfmt_misc.c               |   4 +-
 fs/binfmt_script.c             |   4 +-
 fs/compat.c                    | 128 ++++-----
 fs/exec.c                      | 614 ++++++++++++++++++++++++++---------------
 include/linux/binfmts.h        |  18 +-
 include/linux/mm.h             |   9 +-
 kernel/auditsc.c               |   2 +-
 mm/mmap.c                      |  61 ++--
 mm/mprotect.c                  |   2 +-
 mm/mremap.c                    |   2 +-
 15 files changed, 554 insertions(+), 453 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c
index 6f4d3d06f0ed..e1189ba1ca5e 100644
--- a/arch/ia64/ia32/binfmt_elf32.c
+++ b/arch/ia64/ia32/binfmt_elf32.c
@@ -195,62 +195,27 @@ ia64_elf32_init (struct pt_regs *regs)
 	ia32_load_state(current);
 }
 
+/*
+ * Undo the override of setup_arg_pages() without this ia32_setup_arg_pages()
+ * will suffer infinite self recursion.
+ */
+#undef setup_arg_pages
+
 int
 ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack)
 {
-	unsigned long stack_base;
-	struct vm_area_struct *mpnt;
-	struct mm_struct *mm = current->mm;
-	int i, ret;
-
-	stack_base = IA32_STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
-	mm->arg_start = bprm->p + stack_base;
-
-	bprm->p += stack_base;
-	if (bprm->loader)
-		bprm->loader += stack_base;
-	bprm->exec += stack_base;
-
-	mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-	if (!mpnt)
-		return -ENOMEM;
-
-	down_write(&current->mm->mmap_sem);
-	{
-		mpnt->vm_mm = current->mm;
-		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
-		mpnt->vm_end = IA32_STACK_TOP;
-		if (executable_stack == EXSTACK_ENABLE_X)
-			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
-		else if (executable_stack == EXSTACK_DISABLE_X)
-			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
-		else
-			mpnt->vm_flags = VM_STACK_FLAGS;
-		mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)?
-					PAGE_COPY_EXEC: PAGE_COPY;
-		if ((ret = insert_vm_struct(current->mm, mpnt))) {
-			up_write(&current->mm->mmap_sem);
-			kmem_cache_free(vm_area_cachep, mpnt);
-			return ret;
-		}
-		current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt);
+	int ret;
+
+	ret = setup_arg_pages(bprm, IA32_STACK_TOP, executable_stack);
+	if (!ret) {
+		/*
+		 * Can't do it in ia64_elf32_init(). Needs to be done before
+		 * calls to elf32_map()
+		 */
+		current->thread.ppl = ia32_init_pp_list();
 	}
 
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-		struct page *page = bprm->page[i];
-		if (page) {
-			bprm->page[i] = NULL;
-			install_arg_page(mpnt, page, stack_base);
-		}
-		stack_base += PAGE_SIZE;
-	}
-	up_write(&current->mm->mmap_sem);
-
-	/* Can't do it in ia64_elf32_init(). Needs to be done before calls to
-	   elf32_map() */
-	current->thread.ppl = ia32_init_pp_list();
-
-	return 0;
+	return ret;
 }
 
 static void
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index fe83edb93c10..08781370256d 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -404,7 +404,7 @@ beyond_if:
 
 	set_brk(current->mm->start_brk, current->mm->brk);
 
-	retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
+	retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
 	if (retval < 0) { 
 		/* Someone check-me: is this error path enough? */ 
 		send_sig(SIGKILL, current, 0); 
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 185399baaf6d..ed56a8806eab 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -232,9 +232,6 @@ do {							\
 #define load_elf_binary load_elf32_binary
 
 #define ELF_PLAT_INIT(r, load_addr)	elf32_init(r)
-#define setup_arg_pages(bprm, stack_top, exec_stack) \
-	ia32_setup_arg_pages(bprm, stack_top, exec_stack)
-int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack);
 
 #undef start_thread
 #define start_thread(regs,new_rip,new_rsp) do { \
@@ -286,61 +283,6 @@ static void elf32_init(struct pt_regs *regs)
 	me->thread.es = __USER_DS;
 }
 
-int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
-			 int executable_stack)
-{
-	unsigned long stack_base;
-	struct vm_area_struct *mpnt;
-	struct mm_struct *mm = current->mm;
-	int i, ret;
-
-	stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE;
-	mm->arg_start = bprm->p + stack_base;
-
-	bprm->p += stack_base;
-	if (bprm->loader)
-		bprm->loader += stack_base;
-	bprm->exec += stack_base;
-
-	mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-	if (!mpnt) 
-		return -ENOMEM; 
-
-	down_write(&mm->mmap_sem);
-	{
-		mpnt->vm_mm = mm;
-		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
-		mpnt->vm_end = stack_top;
-		if (executable_stack == EXSTACK_ENABLE_X)
-			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
-		else if (executable_stack == EXSTACK_DISABLE_X)
-			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
-		else
-			mpnt->vm_flags = VM_STACK_FLAGS;
- 		mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
- 			PAGE_COPY_EXEC : PAGE_COPY;
-		if ((ret = insert_vm_struct(mm, mpnt))) {
-			up_write(&mm->mmap_sem);
-			kmem_cache_free(vm_area_cachep, mpnt);
-			return ret;
-		}
-		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
-	} 
-
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-		struct page *page = bprm->page[i];
-		if (page) {
-			bprm->page[i] = NULL;
-			install_arg_page(mpnt, page, stack_base);
-		}
-		stack_base += PAGE_SIZE;
-	}
-	up_write(&mm->mmap_sem);
-	
-	return 0;
-}
-EXPORT_SYMBOL(ia32_setup_arg_pages);
-
 #ifdef CONFIG_SYSCTL
 /* Register vsyscall32 into the ABI table */
 #include <linux/sysctl.h>
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a27e42bf3400..295cbaa0e58a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -148,6 +148,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	elf_addr_t *elf_info;
 	int ei_index = 0;
 	struct task_struct *tsk = current;
+	struct vm_area_struct *vma;
 
 	/*
 	 * If this architecture has a platform capability string, copy it
@@ -234,6 +235,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	sp = (elf_addr_t __user *)bprm->p;
 #endif
 
+
+	/*
+	 * Grow the stack manually; some architectures have a limit on how
+	 * far ahead a user-space access may be in order to grow the stack.
+	 */
+	vma = find_extend_vma(current->mm, bprm->p);
+	if (!vma)
+		return -EFAULT;
+
 	/* Now, let's put argc (and argv, envp if appropriate) on the stack */
 	if (__put_user(argc, sp++))
 		return -EFAULT;
@@ -254,8 +264,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 		size_t len;
 		if (__put_user((elf_addr_t)p, argv++))
 			return -EFAULT;
-		len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE*MAX_ARG_PAGES)
+		len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
 			return 0;
 		p += len;
 	}
@@ -266,8 +276,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 		size_t len;
 		if (__put_user((elf_addr_t)p, envp++))
 			return -EFAULT;
-		len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE*MAX_ARG_PAGES)
+		len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
 			return 0;
 		p += len;
 	}
@@ -826,10 +836,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	}
 
 	/* OK, This is the point of no return */
-	current->mm->start_data = 0;
-	current->mm->end_data = 0;
-	current->mm->end_code = 0;
-	current->mm->mmap = NULL;
 	current->flags &= ~PF_FORKNOEXEC;
 	current->mm->def_flags = def_flags;
 
@@ -1051,9 +1057,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	compute_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
-	create_elf_tables(bprm, &loc->elf_ex,
+	retval = create_elf_tables(bprm, &loc->elf_ex,
 			  (interpreter_type == INTERPRETER_AOUT),
 			  load_addr, interp_load_addr);
+	if (retval < 0) {
+		send_sig(SIGKILL, current, 0);
+		goto out;
+	}
 	/* N.B. passed_fileno might not be initialized? */
 	if (interpreter_type == INTERPRETER_AOUT)
 		current->mm->arg_start += strlen(passed_fileno) + 1;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 9d62fbad3d4b..4739506fb083 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -621,8 +621,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	p = (char __user *) current->mm->arg_start;
 	for (loop = bprm->argc; loop > 0; loop--) {
 		__put_user((elf_caddr_t) p, argv++);
-		len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE * MAX_ARG_PAGES)
+		len = strnlen_user(p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
 			return -EINVAL;
 		p += len;
 	}
@@ -633,8 +633,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	current->mm->env_start = (unsigned long) p;
 	for (loop = bprm->envc; loop > 0; loop--) {
 		__put_user((elf_caddr_t)(unsigned long) p, envp++);
-		len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES);
-		if (!len || len > PAGE_SIZE * MAX_ARG_PAGES)
+		len = strnlen_user(p, MAX_ARG_STRLEN);
+		if (!len || len > MAX_ARG_STRLEN)
 			return -EINVAL;
 		p += len;
 	}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 330fd3fe8546..42e94b3ab7be 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -126,7 +126,9 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto _ret;
 
 	if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
-		remove_arg_zero(bprm);
+		retval = remove_arg_zero(bprm);
+		if (retval)
+			goto _ret;
 	}
 
 	if (fmt->flags & MISC_FMT_OPEN_BINARY) {
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 304c88544d89..4d0e0f6d3273 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -67,7 +67,9 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	 * This is done in reverse order, because of how the
 	 * user environment and arguments are stored.
 	 */
-	remove_arg_zero(bprm);
+	retval = remove_arg_zero(bprm);
+	if (retval)
+		return retval;
 	retval = copy_strings_kernel(1, &bprm->interp, bprm);
 	if (retval < 0) return retval; 
 	bprm->argc++;
diff --git a/fs/compat.c b/fs/compat.c
index 4db6216e5266..15078ce4c04a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1257,6 +1257,7 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
 {
 	struct page *kmapped_page = NULL;
 	char *kaddr = NULL;
+	unsigned long kpos = 0;
 	int ret;
 
 	while (argc-- > 0) {
@@ -1265,92 +1266,84 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
 		unsigned long pos;
 
 		if (get_user(str, argv+argc) ||
-			!(len = strnlen_user(compat_ptr(str), bprm->p))) {
+		    !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
 			ret = -EFAULT;
 			goto out;
 		}
 
-		if (bprm->p < len)  {
+		if (len > MAX_ARG_STRLEN) {
 			ret = -E2BIG;
 			goto out;
 		}
 
-		bprm->p -= len;
-		/* XXX: add architecture specific overflow check here. */
+		/* We're going to work our way backwords. */
 		pos = bprm->p;
+		str += len;
+		bprm->p -= len;
 
 		while (len > 0) {
-			int i, new, err;
 			int offset, bytes_to_copy;
-			struct page *page;
 
 			offset = pos % PAGE_SIZE;
-			i = pos/PAGE_SIZE;
-			page = bprm->page[i];
-			new = 0;
-			if (!page) {
-				page = alloc_page(GFP_HIGHUSER);
-				bprm->page[i] = page;
-				if (!page) {
-					ret = -ENOMEM;
+			if (offset == 0)
+				offset = PAGE_SIZE;
+
+			bytes_to_copy = offset;
+			if (bytes_to_copy > len)
+				bytes_to_copy = len;
+
+			offset -= bytes_to_copy;
+			pos -= bytes_to_copy;
+			str -= bytes_to_copy;
+			len -= bytes_to_copy;
+
+			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
+				struct page *page;
+
+#ifdef CONFIG_STACK_GROWSUP
+				ret = expand_stack_downwards(bprm->vma, pos);
+				if (ret < 0) {
+					/* We've exceed the stack rlimit. */
+					ret = -E2BIG;
+					goto out;
+				}
+#endif
+				ret = get_user_pages(current, bprm->mm, pos,
+						     1, 1, 1, &page, NULL);
+				if (ret <= 0) {
+					/* We've exceed the stack rlimit. */
+					ret = -E2BIG;
 					goto out;
 				}
-				new = 1;
-			}
 
-			if (page != kmapped_page) {
-				if (kmapped_page)
+				if (kmapped_page) {
+					flush_kernel_dcache_page(kmapped_page);
 					kunmap(kmapped_page);
+					put_page(kmapped_page);
+				}
 				kmapped_page = page;
 				kaddr = kmap(kmapped_page);
+				kpos = pos & PAGE_MASK;
+				flush_cache_page(bprm->vma, kpos,
+						 page_to_pfn(kmapped_page));
 			}
-			if (new && offset)
-				memset(kaddr, 0, offset);
-			bytes_to_copy = PAGE_SIZE - offset;
-			if (bytes_to_copy > len) {
-				bytes_to_copy = len;
-				if (new)
-					memset(kaddr+offset+len, 0,
-						PAGE_SIZE-offset-len);
-			}
-			err = copy_from_user(kaddr+offset, compat_ptr(str),
-						bytes_to_copy);
-			if (err) {
+			if (copy_from_user(kaddr+offset, compat_ptr(str),
+						bytes_to_copy)) {
 				ret = -EFAULT;
 				goto out;
 			}
-
-			pos += bytes_to_copy;
-			str += bytes_to_copy;
-			len -= bytes_to_copy;
 		}
 	}
 	ret = 0;
 out:
-	if (kmapped_page)
+	if (kmapped_page) {
+		flush_kernel_dcache_page(kmapped_page);
 		kunmap(kmapped_page);
-	return ret;
-}
-
-#ifdef CONFIG_MMU
-
-#define free_arg_pages(bprm) do { } while (0)
-
-#else
-
-static inline void free_arg_pages(struct linux_binprm *bprm)
-{
-	int i;
-
-	for (i = 0; i < MAX_ARG_PAGES; i++) {
-		if (bprm->page[i])
-			__free_page(bprm->page[i]);
-		bprm->page[i] = NULL;
+		put_page(kmapped_page);
 	}
+	return ret;
 }
 
-#endif /* CONFIG_MMU */
-
 /*
  * compat_do_execve() is mostly a copy of do_execve(), with the exception
  * that it processes 32 bit argv and envp pointers.
@@ -1363,7 +1356,6 @@ int compat_do_execve(char * filename,
 	struct linux_binprm *bprm;
 	struct file *file;
 	int retval;
-	int i;
 
 	retval = -ENOMEM;
 	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
@@ -1377,24 +1369,19 @@ int compat_do_execve(char * filename,
 
 	sched_exec();
 
-	bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
 	bprm->file = file;
 	bprm->filename = filename;
 	bprm->interp = filename;
-	bprm->mm = mm_alloc();
-	retval = -ENOMEM;
-	if (!bprm->mm)
-		goto out_file;
 
-	retval = init_new_context(current, bprm->mm);
-	if (retval < 0)
-		goto out_mm;
+	retval = bprm_mm_init(bprm);
+	if (retval)
+		goto out_file;
 
-	bprm->argc = compat_count(argv, bprm->p / sizeof(compat_uptr_t));
+	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
 	if ((retval = bprm->argc) < 0)
 		goto out_mm;
 
-	bprm->envc = compat_count(envp, bprm->p / sizeof(compat_uptr_t));
+	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
 	if ((retval = bprm->envc) < 0)
 		goto out_mm;
 
@@ -1421,8 +1408,6 @@ int compat_do_execve(char * filename,
 
 	retval = search_binary_handler(bprm, regs);
 	if (retval >= 0) {
-		free_arg_pages(bprm);
-
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
@@ -1431,19 +1416,12 @@ int compat_do_execve(char * filename,
 	}
 
 out:
-	/* Something went wrong, return the inode and free the argument pages*/
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-		struct page * page = bprm->page[i];
-		if (page)
-			__free_page(page);
-	}
-
 	if (bprm->security)
 		security_bprm_free(bprm);
 
 out_mm:
 	if (bprm->mm)
-		mmdrop(bprm->mm);
+		mmput(bprm->mm);
 
 out_file:
 	if (bprm->file) {
diff --git a/fs/exec.c b/fs/exec.c
index 2e3f7950c185..498f2b3dca20 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -54,6 +54,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
+#include <asm/tlb.h>
 
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
@@ -178,6 +179,207 @@ exit:
 	goto out;
 }
 
+#ifdef CONFIG_MMU
+
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		int write)
+{
+	struct page *page;
+	int ret;
+
+#ifdef CONFIG_STACK_GROWSUP
+	if (write) {
+		ret = expand_stack_downwards(bprm->vma, pos);
+		if (ret < 0)
+			return NULL;
+	}
+#endif
+	ret = get_user_pages(current, bprm->mm, pos,
+			1, write, 1, &page, NULL);
+	if (ret <= 0)
+		return NULL;
+
+	if (write) {
+		struct rlimit *rlim = current->signal->rlim;
+		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
+
+		/*
+		 * Limit to 1/4-th the stack size for the argv+env strings.
+		 * This ensures that:
+		 *  - the remaining binfmt code will not run out of stack space,
+		 *  - the program will have a reasonable amount of stack left
+		 *    to work from.
+		 */
+		if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
+			put_page(page);
+			return NULL;
+		}
+	}
+
+	return page;
+}
+
+static void put_arg_page(struct page *page)
+{
+	put_page(page);
+}
+
+static void free_arg_page(struct linux_binprm *bprm, int i)
+{
+}
+
+static void free_arg_pages(struct linux_binprm *bprm)
+{
+}
+
+static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		struct page *page)
+{
+	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
+}
+
+static int __bprm_mm_init(struct linux_binprm *bprm)
+{
+	int err = -ENOMEM;
+	struct vm_area_struct *vma = NULL;
+	struct mm_struct *mm = bprm->mm;
+
+	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (!vma)
+		goto err;
+
+	down_write(&mm->mmap_sem);
+	vma->vm_mm = mm;
+
+	/*
+	 * Place the stack at the largest stack address the architecture
+	 * supports. Later, we'll move this to an appropriate place. We don't
+	 * use STACK_TOP because that can depend on attributes which aren't
+	 * configured yet.
+	 */
+	vma->vm_end = STACK_TOP_MAX;
+	vma->vm_start = vma->vm_end - PAGE_SIZE;
+
+	vma->vm_flags = VM_STACK_FLAGS;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
+	err = insert_vm_struct(mm, vma);
+	if (err) {
+		up_write(&mm->mmap_sem);
+		goto err;
+	}
+
+	mm->stack_vm = mm->total_vm = 1;
+	up_write(&mm->mmap_sem);
+
+	bprm->p = vma->vm_end - sizeof(void *);
+
+	return 0;
+
+err:
+	if (vma) {
+		bprm->vma = NULL;
+		kmem_cache_free(vm_area_cachep, vma);
+	}
+
+	return err;
+}
+
+static bool valid_arg_len(struct linux_binprm *bprm, long len)
+{
+	return len <= MAX_ARG_STRLEN;
+}
+
+#else
+
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		int write)
+{
+	struct page *page;
+
+	page = bprm->page[pos / PAGE_SIZE];
+	if (!page && write) {
+		page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
+		if (!page)
+			return NULL;
+		bprm->page[pos / PAGE_SIZE] = page;
+	}
+
+	return page;
+}
+
+static void put_arg_page(struct page *page)
+{
+}
+
+static void free_arg_page(struct linux_binprm *bprm, int i)
+{
+	if (bprm->page[i]) {
+		__free_page(bprm->page[i]);
+		bprm->page[i] = NULL;
+	}
+}
+
+static void free_arg_pages(struct linux_binprm *bprm)
+{
+	int i;
+
+	for (i = 0; i < MAX_ARG_PAGES; i++)
+		free_arg_page(bprm, i);
+}
+
+static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
+		struct page *page)
+{
+}
+
+static int __bprm_mm_init(struct linux_binprm *bprm)
+{
+	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
+	return 0;
+}
+
+static bool valid_arg_len(struct linux_binprm *bprm, long len)
+{
+	return len <= bprm->p;
+}
+
+#endif /* CONFIG_MMU */
+
+/*
+ * Create a new mm_struct and populate it with a temporary stack
+ * vm_area_struct.  We don't have enough context at this point to set the stack
+ * flags, permissions, and offset, so we use temporary values.  We'll update
+ * them later in setup_arg_pages().
+ */
+int bprm_mm_init(struct linux_binprm *bprm)
+{
+	int err;
+	struct mm_struct *mm = NULL;
+
+	bprm->mm = mm = mm_alloc();
+	err = -ENOMEM;
+	if (!mm)
+		goto err;
+
+	err = init_new_context(current, mm);
+	if (err)
+		goto err;
+
+	err = __bprm_mm_init(bprm);
+	if (err)
+		goto err;
+
+	return 0;
+
+err:
+	if (mm) {
+		bprm->mm = NULL;
+		mmdrop(mm);
+	}
+
+	return err;
+}
+
 /*
  * count() counts the number of strings in array ARGV.
  */
@@ -203,15 +405,16 @@ static int count(char __user * __user * argv, int max)
 }
 
 /*
- * 'copy_strings()' copies argument/environment strings from user
- * memory to free pages in kernel mem. These are in a format ready
- * to be put directly into the top of new user memory.
+ * 'copy_strings()' copies argument/environment strings from the old
+ * processes's memory to the new process's stack.  The call to get_user_pages()
+ * ensures the destination page is created and not swapped out.
  */
 static int copy_strings(int argc, char __user * __user * argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
 	char *kaddr = NULL;
+	unsigned long kpos = 0;
 	int ret;
 
 	while (argc-- > 0) {
@@ -220,69 +423,69 @@ static int copy_strings(int argc, char __user * __user * argv,
 		unsigned long pos;
 
 		if (get_user(str, argv+argc) ||
-				!(len = strnlen_user(str, bprm->p))) {
+				!(len = strnlen_user(str, MAX_ARG_STRLEN))) {
 			ret = -EFAULT;
 			goto out;
 		}
 
-		if (bprm->p < len)  {
+		if (!valid_arg_len(bprm, len)) {
 			ret = -E2BIG;
 			goto out;
 		}
 
-		bprm->p -= len;
-		/* XXX: add architecture specific overflow check here. */
+		/* We're going to work our way backwords. */
 		pos = bprm->p;
+		str += len;
+		bprm->p -= len;
 
 		while (len > 0) {
-			int i, new, err;
 			int offset, bytes_to_copy;
-			struct page *page;
 
 			offset = pos % PAGE_SIZE;
-			i = pos/PAGE_SIZE;
-			page = bprm->page[i];
-			new = 0;
-			if (!page) {
-				page = alloc_page(GFP_HIGHUSER);
-				bprm->page[i] = page;
+			if (offset == 0)
+				offset = PAGE_SIZE;
+
+			bytes_to_copy = offset;
+			if (bytes_to_copy > len)
+				bytes_to_copy = len;
+
+			offset -= bytes_to_copy;
+			pos -= bytes_to_copy;
+			str -= bytes_to_copy;
+			len -= bytes_to_copy;
+
+			if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
+				struct page *page;
+
+				page = get_arg_page(bprm, pos, 1);
 				if (!page) {
-					ret = -ENOMEM;
+					ret = -E2BIG;
 					goto out;
 				}
-				new = 1;
-			}
 
-			if (page != kmapped_page) {
-				if (kmapped_page)
+				if (kmapped_page) {
+					flush_kernel_dcache_page(kmapped_page);
 					kunmap(kmapped_page);
+					put_arg_page(kmapped_page);
+				}
 				kmapped_page = page;
 				kaddr = kmap(kmapped_page);
+				kpos = pos & PAGE_MASK;
+				flush_arg_page(bprm, kpos, kmapped_page);
 			}
-			if (new && offset)
-				memset(kaddr, 0, offset);
-			bytes_to_copy = PAGE_SIZE - offset;
-			if (bytes_to_copy > len) {
-				bytes_to_copy = len;
-				if (new)
-					memset(kaddr+offset+len, 0,
-						PAGE_SIZE-offset-len);
-			}
-			err = copy_from_user(kaddr+offset, str, bytes_to_copy);
-			if (err) {
+			if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
 				ret = -EFAULT;
 				goto out;
 			}
-
-			pos += bytes_to_copy;
-			str += bytes_to_copy;
-			len -= bytes_to_copy;
 		}
 	}
 	ret = 0;
 out:
-	if (kmapped_page)
+	if (kmapped_page) {
+		flush_kernel_dcache_page(kmapped_page);
 		kunmap(kmapped_page);
+		put_arg_page(kmapped_page);
+	}
 	return ret;
 }
 
@@ -298,181 +501,172 @@ int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
 	set_fs(oldfs);
 	return r;
 }
-
 EXPORT_SYMBOL(copy_strings_kernel);
 
 #ifdef CONFIG_MMU
+
 /*
- * This routine is used to map in a page into an address space: needed by
- * execve() for the initial stack and environment pages.
+ * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
+ * the binfmt code determines where the new stack should reside, we shift it to
+ * its final location.  The process proceeds as follows:
  *
- * vma->vm_mm->mmap_sem is held for writing.
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges.  This ensures the
+ *    arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
  */
-void install_arg_page(struct vm_area_struct *vma,
-			struct page *page, unsigned long address)
+static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	pte_t * pte;
-	spinlock_t *ptl;
+	unsigned long old_start = vma->vm_start;
+	unsigned long old_end = vma->vm_end;
+	unsigned long length = old_end - old_start;
+	unsigned long new_start = old_start - shift;
+	unsigned long new_end = old_end - shift;
+	struct mmu_gather *tlb;
 
-	if (unlikely(anon_vma_prepare(vma)))
-		goto out;
+	BUG_ON(new_start > new_end);
 
-	flush_dcache_page(page);
-	pte = get_locked_pte(mm, address, &ptl);
-	if (!pte)
-		goto out;
-	if (!pte_none(*pte)) {
-		pte_unmap_unlock(pte, ptl);
-		goto out;
+	/*
+	 * ensure there are no vmas between where we want to go
+	 * and where we are
+	 */
+	if (vma != find_vma(mm, new_start))
+		return -EFAULT;
+
+	/*
+	 * cover the whole range: [new_start, old_end)
+	 */
+	vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL);
+
+	/*
+	 * move the page tables downwards, on failure we rely on
+	 * process cleanup to remove whatever mess we made.
+	 */
+	if (length != move_page_tables(vma, old_start,
+				       vma, new_start, length))
+		return -ENOMEM;
+
+	lru_add_drain();
+	tlb = tlb_gather_mmu(mm, 0);
+	if (new_end > old_start) {
+		/*
+		 * when the old and new regions overlap clear from new_end.
+		 */
+		free_pgd_range(&tlb, new_end, old_end, new_end,
+			vma->vm_next ? vma->vm_next->vm_start : 0);
+	} else {
+		/*
+		 * otherwise, clean from old_start; this is done to not touch
+		 * the address space in [new_end, old_start) some architectures
+		 * have constraints on va-space that make this illegal (IA64) -
+		 * for the others its just a little faster.
+		 */
+		free_pgd_range(&tlb, old_start, old_end, new_end,
+			vma->vm_next ? vma->vm_next->vm_start : 0);
 	}
-	inc_mm_counter(mm, anon_rss);
-	lru_cache_add_active(page);
-	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
-					page, vma->vm_page_prot))));
-	page_add_new_anon_rmap(page, vma, address);
-	pte_unmap_unlock(pte, ptl);
-
-	/* no need for flush_tlb */
-	return;
-out:
-	__free_page(page);
-	force_sig(SIGKILL, current);
+	tlb_finish_mmu(tlb, new_end, old_end);
+
+	/*
+	 * shrink the vma to just the new range.
+	 */
+	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
+
+	return 0;
 }
 
 #define EXTRA_STACK_VM_PAGES	20	/* random */
 
+/*
+ * Finalizes the stack vm_area_struct. The flags and permissions are updated,
+ * the stack is optionally relocated, and some extra space is added.
+ */
 int setup_arg_pages(struct linux_binprm *bprm,
 		    unsigned long stack_top,
 		    int executable_stack)
 {
-	unsigned long stack_base;
-	struct vm_area_struct *mpnt;
+	unsigned long ret;
+	unsigned long stack_shift;
 	struct mm_struct *mm = current->mm;
-	int i, ret;
-	long arg_size;
+	struct vm_area_struct *vma = bprm->vma;
+	struct vm_area_struct *prev = NULL;
+	unsigned long vm_flags;
+	unsigned long stack_base;
 
 #ifdef CONFIG_STACK_GROWSUP
-	/* Move the argument and environment strings to the bottom of the
-	 * stack space.
-	 */
-	int offset, j;
-	char *to, *from;
-
-	/* Start by shifting all the pages down */
-	i = 0;
-	for (j = 0; j < MAX_ARG_PAGES; j++) {
-		struct page *page = bprm->page[j];
-		if (!page)
-			continue;
-		bprm->page[i++] = page;
-	}
-
-	/* Now move them within their pages */
-	offset = bprm->p % PAGE_SIZE;
-	to = kmap(bprm->page[0]);
-	for (j = 1; j < i; j++) {
-		memmove(to, to + offset, PAGE_SIZE - offset);
-		from = kmap(bprm->page[j]);
-		memcpy(to + PAGE_SIZE - offset, from, offset);
-		kunmap(bprm->page[j - 1]);
-		to = from;
-	}
-	memmove(to, to + offset, PAGE_SIZE - offset);
-	kunmap(bprm->page[j - 1]);
-
 	/* Limit stack size to 1GB */
 	stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
 	if (stack_base > (1 << 30))
 		stack_base = 1 << 30;
-	stack_base = PAGE_ALIGN(stack_top - stack_base);
 
-	/* Adjust bprm->p to point to the end of the strings. */
-	bprm->p = stack_base + PAGE_SIZE * i - offset;
+	/* Make sure we didn't let the argument array grow too large. */
+	if (vma->vm_end - vma->vm_start > stack_base)
+		return -ENOMEM;
 
-	mm->arg_start = stack_base;
-	arg_size = i << PAGE_SHIFT;
+	stack_base = PAGE_ALIGN(stack_top - stack_base);
 
-	/* zero pages that were copied above */
-	while (i < MAX_ARG_PAGES)
-		bprm->page[i++] = NULL;
+	stack_shift = vma->vm_start - stack_base;
+	mm->arg_start = bprm->p - stack_shift;
+	bprm->p = vma->vm_end - stack_shift;
 #else
-	stack_base = arch_align_stack(stack_top - MAX_ARG_PAGES*PAGE_SIZE);
-	stack_base = PAGE_ALIGN(stack_base);
-	bprm->p += stack_base;
+	stack_top = arch_align_stack(stack_top);
+	stack_top = PAGE_ALIGN(stack_top);
+	stack_shift = vma->vm_end - stack_top;
+
+	bprm->p -= stack_shift;
 	mm->arg_start = bprm->p;
-	arg_size = stack_top - (PAGE_MASK & (unsigned long) mm->arg_start);
 #endif
 
-	arg_size += EXTRA_STACK_VM_PAGES * PAGE_SIZE;
-
 	if (bprm->loader)
-		bprm->loader += stack_base;
-	bprm->exec += stack_base;
-
-	mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-	if (!mpnt)
-		return -ENOMEM;
+		bprm->loader -= stack_shift;
+	bprm->exec -= stack_shift;
 
 	down_write(&mm->mmap_sem);
-	{
-		mpnt->vm_mm = mm;
-#ifdef CONFIG_STACK_GROWSUP
-		mpnt->vm_start = stack_base;
-		mpnt->vm_end = stack_base + arg_size;
-#else
-		mpnt->vm_end = stack_top;
-		mpnt->vm_start = mpnt->vm_end - arg_size;
-#endif
-		/* Adjust stack execute permissions; explicitly enable
-		 * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X
-		 * and leave alone (arch default) otherwise. */
-		if (unlikely(executable_stack == EXSTACK_ENABLE_X))
-			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
-		else if (executable_stack == EXSTACK_DISABLE_X)
-			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
-		else
-			mpnt->vm_flags = VM_STACK_FLAGS;
-		mpnt->vm_flags |= mm->def_flags;
-		mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7];
-		if ((ret = insert_vm_struct(mm, mpnt))) {
+	vm_flags = vma->vm_flags;
+
+	/*
+	 * Adjust stack execute permissions; explicitly enable for
+	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
+	 * (arch default) otherwise.
+	 */
+	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
+		vm_flags |= VM_EXEC;
+	else if (executable_stack == EXSTACK_DISABLE_X)
+		vm_flags &= ~VM_EXEC;
+	vm_flags |= mm->def_flags;
+
+	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
+			vm_flags);
+	if (ret)
+		goto out_unlock;
+	BUG_ON(prev != vma);
+
+	/* Move stack pages down in memory. */
+	if (stack_shift) {
+		ret = shift_arg_pages(vma, stack_shift);
+		if (ret) {
 			up_write(&mm->mmap_sem);
-			kmem_cache_free(vm_area_cachep, mpnt);
 			return ret;
 		}
-		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
 	}
 
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-		struct page *page = bprm->page[i];
-		if (page) {
-			bprm->page[i] = NULL;
-			install_arg_page(mpnt, page, stack_base);
-		}
-		stack_base += PAGE_SIZE;
-	}
+#ifdef CONFIG_STACK_GROWSUP
+	stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+#else
+	stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+#endif
+	ret = expand_stack(vma, stack_base);
+	if (ret)
+		ret = -EFAULT;
+
+out_unlock:
 	up_write(&mm->mmap_sem);
-	
 	return 0;
 }
-
 EXPORT_SYMBOL(setup_arg_pages);
 
-#define free_arg_pages(bprm) do { } while (0)
-
-#else
-
-static inline void free_arg_pages(struct linux_binprm *bprm)
-{
-	int i;
-
-	for (i = 0; i < MAX_ARG_PAGES; i++) {
-		if (bprm->page[i])
-			__free_page(bprm->page[i]);
-		bprm->page[i] = NULL;
-	}
-}
-
 #endif /* CONFIG_MMU */
 
 struct file *open_exec(const char *name)
@@ -1000,43 +1194,42 @@ EXPORT_SYMBOL(compute_creds);
  * points to; chop off the first by relocating brpm->p to right after
  * the first '\0' encountered.
  */
-void remove_arg_zero(struct linux_binprm *bprm)
+int remove_arg_zero(struct linux_binprm *bprm)
 {
-	if (bprm->argc) {
-		char ch;
+	int ret = 0;
+	unsigned long offset;
+	char *kaddr;
+	struct page *page;
 
-		do {
-			unsigned long offset;
-			unsigned long index;
-			char *kaddr;
-			struct page *page;
-
-			offset = bprm->p & ~PAGE_MASK;
-			index = bprm->p >> PAGE_SHIFT;
+	if (!bprm->argc)
+		return 0;
 
-			page = bprm->page[index];
-			kaddr = kmap_atomic(page, KM_USER0);
+	do {
+		offset = bprm->p & ~PAGE_MASK;
+		page = get_arg_page(bprm, bprm->p, 0);
+		if (!page) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
 
-			/* run through page until we reach end or find NUL */
-			do {
-				ch = *(kaddr + offset);
+		for (; offset < PAGE_SIZE && kaddr[offset];
+				offset++, bprm->p++)
+			;
 
-				/* discard that character... */
-				bprm->p++;
-				offset++;
-			} while (offset < PAGE_SIZE && ch != '\0');
+		kunmap_atomic(kaddr, KM_USER0);
+		put_arg_page(page);
 
-			kunmap_atomic(kaddr, KM_USER0);
+		if (offset == PAGE_SIZE)
+			free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
+	} while (offset == PAGE_SIZE);
 
-			/* free the old page */
-			if (offset == PAGE_SIZE) {
-				__free_page(page);
-				bprm->page[index] = NULL;
-			}
-		} while (ch != '\0');
+	bprm->p++;
+	bprm->argc--;
+	ret = 0;
 
-		bprm->argc--;
-	}
+out:
+	return ret;
 }
 EXPORT_SYMBOL(remove_arg_zero);
 
@@ -1062,7 +1255,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 		fput(bprm->file);
 		bprm->file = NULL;
 
-	        loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
+		loader = bprm->vma->vm_end - sizeof(void *);
 
 		file = open_exec("/sbin/loader");
 		retval = PTR_ERR(file);
@@ -1156,7 +1349,6 @@ int do_execve(char * filename,
 	struct file *file;
 	unsigned long env_p;
 	int retval;
-	int i;
 
 	retval = -ENOMEM;
 	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
@@ -1170,25 +1362,19 @@ int do_execve(char * filename,
 
 	sched_exec();
 
-	bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
-
 	bprm->file = file;
 	bprm->filename = filename;
 	bprm->interp = filename;
-	bprm->mm = mm_alloc();
-	retval = -ENOMEM;
-	if (!bprm->mm)
-		goto out_file;
 
-	retval = init_new_context(current, bprm->mm);
-	if (retval < 0)
-		goto out_mm;
+	retval = bprm_mm_init(bprm);
+	if (retval)
+		goto out_file;
 
-	bprm->argc = count(argv, bprm->p / sizeof(void *));
+	bprm->argc = count(argv, MAX_ARG_STRINGS);
 	if ((retval = bprm->argc) < 0)
 		goto out_mm;
 
-	bprm->envc = count(envp, bprm->p / sizeof(void *));
+	bprm->envc = count(envp, MAX_ARG_STRINGS);
 	if ((retval = bprm->envc) < 0)
 		goto out_mm;
 
@@ -1217,9 +1403,8 @@ int do_execve(char * filename,
 
 	retval = search_binary_handler(bprm,regs);
 	if (retval >= 0) {
-		free_arg_pages(bprm);
-
 		/* execve success */
+		free_arg_pages(bprm);
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
 		kfree(bprm);
@@ -1227,26 +1412,19 @@ int do_execve(char * filename,
 	}
 
 out:
-	/* Something went wrong, return the inode and free the argument pages*/
-	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
-		struct page * page = bprm->page[i];
-		if (page)
-			__free_page(page);
-	}
-
+	free_arg_pages(bprm);
 	if (bprm->security)
 		security_bprm_free(bprm);
 
 out_mm:
 	if (bprm->mm)
-		mmdrop(bprm->mm);
+		mmput (bprm->mm);
 
 out_file:
 	if (bprm->file) {
 		allow_write_access(bprm->file);
 		fput(bprm->file);
 	}
-
 out_kfree:
 	kfree(bprm);
 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index a0b209cd5761..91c8c07fe8b7 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -6,11 +6,13 @@
 struct pt_regs;
 
 /*
- * MAX_ARG_PAGES defines the number of pages allocated for arguments
- * and envelope for the new program. 32 should suffice, this gives
- * a maximum env+arg of 128kB w/4KB pages!
+ * These are the maximum length and maximum number of strings passed to the
+ * execve() system call.  MAX_ARG_STRLEN is essentially random but serves to
+ * prevent the kernel from being unduly impacted by misaddressed pointers.
+ * MAX_ARG_STRINGS is chosen to fit in a signed 32-bit integer.
  */
-#define MAX_ARG_PAGES 32
+#define MAX_ARG_STRLEN (PAGE_SIZE * 32)
+#define MAX_ARG_STRINGS 0x7FFFFFFF
 
 /* sizeof(linux_binprm->buf) */
 #define BINPRM_BUF_SIZE 128
@@ -24,7 +26,12 @@ struct pt_regs;
  */
 struct linux_binprm{
 	char buf[BINPRM_BUF_SIZE];
+#ifdef CONFIG_MMU
+	struct vm_area_struct *vma;
+#else
+# define MAX_ARG_PAGES	32
 	struct page *page[MAX_ARG_PAGES];
+#endif
 	struct mm_struct *mm;
 	unsigned long p; /* current top of mem */
 	int sh_bang;
@@ -69,7 +76,7 @@ extern int register_binfmt(struct linux_binfmt *);
 extern int unregister_binfmt(struct linux_binfmt *);
 
 extern int prepare_binprm(struct linux_binprm *);
-extern void remove_arg_zero(struct linux_binprm *);
+extern int __must_check remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *,struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
 
@@ -86,6 +93,7 @@ extern int suid_dumpable;
 extern int setup_arg_pages(struct linux_binprm * bprm,
 			   unsigned long stack_top,
 			   int executable_stack);
+extern int bprm_mm_init(struct linux_binprm *bprm);
 extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
 extern void compute_creds(struct linux_binprm *binprm);
 extern int do_coredump(long signr, int exit_code, struct pt_regs * regs);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 50a0ed1d1806..c456c3a1c28e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -808,7 +808,6 @@ static inline int handle_mm_fault(struct mm_struct *mm,
 
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
-void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
@@ -825,9 +824,15 @@ int FASTCALL(set_page_dirty(struct page *page));
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
+extern unsigned long move_page_tables(struct vm_area_struct *vma,
+		unsigned long old_addr, struct vm_area_struct *new_vma,
+		unsigned long new_addr, unsigned long len);
 extern unsigned long do_mremap(unsigned long addr,
 			       unsigned long old_len, unsigned long new_len,
 			       unsigned long flags, unsigned long new_addr);
+extern int mprotect_fixup(struct vm_area_struct *vma,
+			  struct vm_area_struct **pprev, unsigned long start,
+			  unsigned long end, unsigned long newflags);
 
 /*
  * A callback you can register to apply pressure to ageable caches.
@@ -1159,6 +1164,8 @@ extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
 #ifdef CONFIG_IA64
 extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
 #endif
+extern int expand_stack_downwards(struct vm_area_struct *vma,
+				  unsigned long address);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 535586fc498b..145cbb79c4b9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -843,7 +843,7 @@ static void audit_log_execve_info(struct audit_buffer *ab,
 		return; /* execve failed, no additional info */
 
 	for (i = 0; i < axi->argc; i++, p += len) {
-		len = strnlen_user(p, MAX_ARG_PAGES*PAGE_SIZE);
+		len = strnlen_user(p, MAX_ARG_STRLEN);
 		/*
 		 * We just created this mm, if we can't find the strings
 		 * we just copied into it something is _very_ wrong. Similar
diff --git a/mm/mmap.c b/mm/mmap.c
index 724f342bcf89..7afc7a7cec6f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1571,33 +1571,11 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
 
-#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
-{
-	return expand_upwards(vma, address);
-}
-
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
-	struct vm_area_struct *vma, *prev;
-
-	addr &= PAGE_MASK;
-	vma = find_vma_prev(mm, addr, &prev);
-	if (vma && (vma->vm_start <= addr))
-		return vma;
-	if (!prev || expand_stack(prev, addr))
-		return NULL;
-	if (prev->vm_flags & VM_LOCKED) {
-		make_pages_present(addr, prev->vm_end);
-	}
-	return prev;
-}
-#else
 /*
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
  */
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+static inline int expand_downwards(struct vm_area_struct *vma,
+				   unsigned long address)
 {
 	int error;
 
@@ -1634,6 +1612,38 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
 	return error;
 }
 
+int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+	return expand_downwards(vma, address);
+}
+
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	return expand_upwards(vma, address);
+}
+
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma, *prev;
+
+	addr &= PAGE_MASK;
+	vma = find_vma_prev(mm, addr, &prev);
+	if (vma && (vma->vm_start <= addr))
+		return vma;
+	if (!prev || expand_stack(prev, addr))
+		return NULL;
+	if (prev->vm_flags & VM_LOCKED)
+		make_pages_present(addr, prev->vm_end);
+	return prev;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	return expand_downwards(vma, address);
+}
+
 struct vm_area_struct *
 find_extend_vma(struct mm_struct * mm, unsigned long addr)
 {
@@ -1651,9 +1661,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 	start = vma->vm_start;
 	if (expand_stack(vma, addr))
 		return NULL;
-	if (vma->vm_flags & VM_LOCKED) {
+	if (vma->vm_flags & VM_LOCKED)
 		make_pages_present(addr, start);
-	}
 	return vma;
 }
 #endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3b8f3c0c63f3..e8346c30abec 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -128,7 +128,7 @@ static void change_protection(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-static int
+int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long start, unsigned long end, unsigned long newflags)
 {
diff --git a/mm/mremap.c b/mm/mremap.c
index bc7c52efc71b..8ea5c2412c6e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -120,7 +120,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
 
-static unsigned long move_page_tables(struct vm_area_struct *vma,
+unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len)
 {
-- 
cgit v1.3-8-gc7d7


From 16f1820028d660d9da9c03b2ae7e98253c11795b Mon Sep 17 00:00:00 2001
From: Josef 'Jeff' Sipek <jsipek@cs.sunysb.edu>
Date: Thu, 19 Jul 2007 01:48:18 -0700
Subject: fs: introduce vfs_path_lookup

Stackable file systems, among others, frequently need to lookup paths or
path components starting from an arbitrary point in the namespace
(identified by a dentry and a vfsmount).  Currently, such file systems use
lookup_one_len, which is frowned upon [1] as it does not pass the lookup
intent along; not passing a lookup intent, for example, can trigger BUG_ON's
when stacking on top of NFSv4.

The first patch introduces a new lookup function to allow lookup starting
from an arbitrary point in the namespace.  This approach has been suggested
by Christoph Hellwig [2].

The second patch changes sunrpc to use vfs_path_lookup.

The third patch changes nfsctl.c to use vfs_path_lookup.

The fourth patch marks link_path_walk static.

The fifth, and last patch, unexports path_walk because it is no longer
unnecessary to call it directly, and using the new vfs_path_lookup is
cleaner.

For example, the following snippet of code, looks up "some/path/component"
in a directory pointed to by parent_{dentry,vfsmnt}:

err = vfs_path_lookup(parent_dentry, parent_vfsmnt,
		      "some/path/component", 0, &nd);
if (!err) {
	/* exits */

	...

	/* once done, release the references */
	path_release(&nd);
} else if (err == -ENOENT) {
	/* doesn't exist */
} else {
	/* other error */
}

VFS functions such as lookup_create can be used on the nameidata structure
to pass the create intent to the file system.

Signed-off-by: Josef 'Jeff' Sipek <jsipek@cs.sunysb.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@suse.de>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c            | 32 ++++++++++++++++++++++++++++++++
 include/linux/namei.h |  2 ++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index defaa47c11d4..3bdb29615a9d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1172,6 +1172,37 @@ int fastcall path_lookup(const char *name, unsigned int flags,
 	return do_path_lookup(AT_FDCWD, name, flags, nd);
 }
 
+/**
+ * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
+ * @dentry:  pointer to dentry of the base directory
+ * @mnt: pointer to vfs mount of the base directory
+ * @name: pointer to file name
+ * @flags: lookup flags
+ * @nd: pointer to nameidata
+ */
+int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
+		    const char *name, unsigned int flags,
+		    struct nameidata *nd)
+{
+	int retval;
+
+	/* same as do_path_lookup */
+	nd->last_type = LAST_ROOT;
+	nd->flags = flags;
+	nd->depth = 0;
+
+	nd->mnt = mntget(mnt);
+	nd->dentry = dget(dentry);
+
+	retval = path_walk(name, nd);
+	if (unlikely(!retval && !audit_dummy_context() && nd->dentry &&
+				nd->dentry->d_inode))
+		audit_inode(name, nd->dentry->d_inode);
+
+	return retval;
+
+}
+
 static int __path_lookup_intent_open(int dfd, const char *name,
 		unsigned int lookup_flags, struct nameidata *nd,
 		int open_flags, int create_mode)
@@ -2774,6 +2805,7 @@ EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
 EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(path_release);
 EXPORT_SYMBOL(path_walk);
 EXPORT_SYMBOL(permission);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b7dd24917f0d..2e21af0989d9 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -69,6 +69,8 @@ extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struc
 #define user_path_walk_link(name,nd) \
 	__user_walk_fd(AT_FDCWD, name, 0, nd)
 extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+			   const char *, unsigned int, struct nameidata *);
 extern int FASTCALL(path_walk(const char *, struct nameidata *));
 extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
 extern void path_release(struct nameidata *);
-- 
cgit v1.3-8-gc7d7


From c4a7808fc3d7a346d5d12e0d69d76d66d821488b Mon Sep 17 00:00:00 2001
From: Josef 'Jeff' Sipek <jsipek@cs.sunysb.edu>
Date: Thu, 19 Jul 2007 01:48:22 -0700
Subject: fs: mark link_path_walk static

Signed-off-by: Josef 'Jeff' Sipek <jsipek@cs.sunysb.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@suse.de>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c            | 4 +++-
 include/linux/namei.h | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 3bdb29615a9d..b9fdda8c0930 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -107,6 +107,8 @@
  * any extra contention...
  */
 
+static int fastcall link_path_walk(const char *name, struct nameidata *nd);
+
 /* In order to reduce some races, while at the same time doing additional
  * checking and hopefully speeding things up, we copy filenames to the
  * kernel data space before using them..
@@ -998,7 +1000,7 @@ return_err:
  * Retry the whole path once, forcing real lookup requests
  * instead of relying on the dcache.
  */
-int fastcall link_path_walk(const char *name, struct nameidata *nd)
+static int fastcall link_path_walk(const char *name, struct nameidata *nd)
 {
 	struct nameidata save = *nd;
 	int result;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 2e21af0989d9..18ea81265068 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -72,7 +72,6 @@ extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct nameidata *);
 extern int FASTCALL(path_walk(const char *, struct nameidata *));
-extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
 extern void path_release(struct nameidata *);
 extern void path_release_on_umount(struct nameidata *);
 
-- 
cgit v1.3-8-gc7d7


From f79c20f52532d38fd0aee7ef64e138cc1613c484 Mon Sep 17 00:00:00 2001
From: Josef 'Jeff' Sipek <jsipek@cs.sunysb.edu>
Date: Thu, 19 Jul 2007 01:48:22 -0700
Subject: fs: remove path_walk export

Signed-off-by: Josef 'Jeff' Sipek <jsipek@cs.sunysb.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@suse.de>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c            | 3 +--
 include/linux/namei.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index b9fdda8c0930..a83160acd748 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1024,7 +1024,7 @@ static int fastcall link_path_walk(const char *name, struct nameidata *nd)
 	return result;
 }
 
-int fastcall path_walk(const char * name, struct nameidata *nd)
+static int fastcall path_walk(const char * name, struct nameidata *nd)
 {
 	current->total_link_count = 0;
 	return link_path_walk(name, nd);
@@ -2809,7 +2809,6 @@ EXPORT_SYMBOL(page_symlink_inode_operations);
 EXPORT_SYMBOL(path_lookup);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(path_release);
-EXPORT_SYMBOL(path_walk);
 EXPORT_SYMBOL(permission);
 EXPORT_SYMBOL(vfs_permission);
 EXPORT_SYMBOL(file_permission);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 18ea81265068..6c38efbd810f 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -71,7 +71,6 @@ extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struc
 extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct nameidata *);
-extern int FASTCALL(path_walk(const char *, struct nameidata *));
 extern void path_release(struct nameidata *);
 extern void path_release_on_umount(struct nameidata *);
 
-- 
cgit v1.3-8-gc7d7


From 6c5d523826dc639df709ed0f88c5d2ce25379652 Mon Sep 17 00:00:00 2001
From: "Kawai, Hidehiro" <hidehiro.kawai.ez@hitachi.com>
Date: Thu, 19 Jul 2007 01:48:27 -0700
Subject: coredump masking: reimplementation of dumpable using two flags

This patch changes mm_struct.dumpable to a pair of bit flags.

set_dumpable() converts three-value dumpable to two flags and stores it into
lower two bits of mm_struct.flags instead of mm_struct.dumpable.
get_dumpable() behaves in the opposite way.

[akpm@linux-foundation.org: export set_dumpable]
Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                    | 62 +++++++++++++++++++++++++++++++++++++++-----
 fs/proc/base.c               |  2 +-
 include/asm-ia64/processor.h |  4 +--
 include/linux/sched.h        |  9 ++++++-
 kernel/ptrace.c              |  2 +-
 kernel/sys.c                 | 24 ++++++++---------
 security/commoncap.c         |  2 +-
 security/dummy.c             |  2 +-
 8 files changed, 82 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 498f2b3dca20..7bdea7937ee8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1058,9 +1058,9 @@ int flush_old_exec(struct linux_binprm * bprm)
 	current->sas_ss_sp = current->sas_ss_size = 0;
 
 	if (current->euid == current->uid && current->egid == current->gid)
-		current->mm->dumpable = 1;
+		set_dumpable(current->mm, 1);
 	else
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 
 	name = bprm->filename;
 
@@ -1088,7 +1088,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	    file_permission(bprm->file, MAY_READ) ||
 	    (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
 		suid_keys(current);
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 	}
 
 	/* An exec changes our domain. We are no longer part of the thread
@@ -1665,6 +1665,56 @@ fail:
 	return core_waiters;
 }
 
+/*
+ * set_dumpable converts traditional three-value dumpable to two flags and
+ * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
+ * these bits are not changed atomically.  So get_dumpable can observe the
+ * intermediate state.  To avoid doing unexpected behavior, get get_dumpable
+ * return either old dumpable or new one by paying attention to the order of
+ * modifying the bits.
+ *
+ * dumpable |   mm->flags (binary)
+ * old  new | initial interim  final
+ * ---------+-----------------------
+ *  0    1  |   00      01      01
+ *  0    2  |   00      10(*)   11
+ *  1    0  |   01      00      00
+ *  1    2  |   01      11      11
+ *  2    0  |   11      10(*)   00
+ *  2    1  |   11      11      01
+ *
+ * (*) get_dumpable regards interim value of 10 as 11.
+ */
+void set_dumpable(struct mm_struct *mm, int value)
+{
+	switch (value) {
+	case 0:
+		clear_bit(MMF_DUMPABLE, &mm->flags);
+		smp_wmb();
+		clear_bit(MMF_DUMP_SECURELY, &mm->flags);
+		break;
+	case 1:
+		set_bit(MMF_DUMPABLE, &mm->flags);
+		smp_wmb();
+		clear_bit(MMF_DUMP_SECURELY, &mm->flags);
+		break;
+	case 2:
+		set_bit(MMF_DUMP_SECURELY, &mm->flags);
+		smp_wmb();
+		set_bit(MMF_DUMPABLE, &mm->flags);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(set_dumpable);
+
+int get_dumpable(struct mm_struct *mm)
+{
+	int ret;
+
+	ret = mm->flags & 0x3;
+	return (ret >= 2) ? 2 : ret;
+}
+
 int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 {
 	char corename[CORENAME_MAX_SIZE + 1];
@@ -1683,7 +1733,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
 	down_write(&mm->mmap_sem);
-	if (!mm->dumpable) {
+	if (!get_dumpable(mm)) {
 		up_write(&mm->mmap_sem);
 		goto fail;
 	}
@@ -1693,11 +1743,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 *	process nor do we know its entire history. We only know it
 	 *	was tainted so we dump it as root in mode 2.
 	 */
-	if (mm->dumpable == 2) {	/* Setuid core dump mode */
+	if (get_dumpable(mm) == 2) {	/* Setuid core dump mode */
 		flag = O_EXCL;		/* Stop rewrite attacks */
 		current->fsuid = 0;	/* Dump root private */
 	}
-	mm->dumpable = 0;
+	set_dumpable(mm, 0);
 
 	retval = coredump_wait(exit_code);
 	if (retval < 0)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 42cb4f5613b6..49b3ab0175e0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1014,7 +1014,7 @@ static int task_dumpable(struct task_struct *task)
 	task_lock(task);
 	mm = task->mm;
 	if (mm)
-		dumpable = mm->dumpable;
+		dumpable = get_dumpable(mm);
 	task_unlock(task);
 	if(dumpable == 1)
 		return 1;
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index db81ba406cef..6251c76437d2 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -295,9 +295,9 @@ struct thread_struct {
 	regs->ar_bspstore = current->thread.rbs_bot;						\
 	regs->ar_fpsr = FPSR_DEFAULT;								\
 	regs->loadrs = 0;									\
-	regs->r8 = current->mm->dumpable;	/* set "don't zap registers" flag */		\
+	regs->r8 = get_dumpable(current->mm);	/* set "don't zap registers" flag */		\
 	regs->r12 = new_sp - 16;	/* allocate 16 byte scratch area */			\
-	if (unlikely(!current->mm->dumpable)) {							\
+	if (unlikely(!get_dumpable(current->mm))) {							\
 		/*										\
 		 * Zap scratch regs to avoid leaking bits between processes with different	\
 		 * uid/privileges.								\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 731edaca8ffd..8dbd08366400 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -345,6 +345,13 @@ typedef unsigned long mm_counter_t;
 		(mm)->hiwater_vm = (mm)->total_vm;	\
 } while (0)
 
+extern void set_dumpable(struct mm_struct *mm, int value);
+extern int get_dumpable(struct mm_struct *mm);
+
+/* mm flags */
+#define MMF_DUMPABLE      0  /* core dump is permitted */
+#define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
+
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
@@ -402,7 +409,7 @@ struct mm_struct {
 	unsigned int token_priority;
 	unsigned int last_interval;
 
-	unsigned char dumpable:2;
+	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	/* coredumping support */
 	int core_waiters;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4a1745f1dadf..82a558b655da 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task)
 		return -EPERM;
 	smp_rmb();
 	if (task->mm)
-		dumpable = task->mm->dumpable;
+		dumpable = get_dumpable(task->mm);
 	if (!dumpable && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index d40e40a9446c..08562f419768 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1036,7 +1036,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 			return -EPERM;
 	}
 	if (new_egid != old_egid) {
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
 	if (rgid != (gid_t) -1 ||
@@ -1066,13 +1066,13 @@ asmlinkage long sys_setgid(gid_t gid)
 
 	if (capable(CAP_SETGID)) {
 		if (old_egid != gid) {
-			current->mm->dumpable = suid_dumpable;
+			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
 		current->gid = current->egid = current->sgid = current->fsgid = gid;
 	} else if ((gid == current->gid) || (gid == current->sgid)) {
 		if (old_egid != gid) {
-			current->mm->dumpable = suid_dumpable;
+			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
 		current->egid = current->fsgid = gid;
@@ -1103,7 +1103,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 	switch_uid(new_user);
 
 	if (dumpclear) {
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
 	current->uid = new_ruid;
@@ -1159,7 +1159,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 		return -EAGAIN;
 
 	if (new_euid != old_euid) {
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
 	current->fsuid = current->euid = new_euid;
@@ -1209,7 +1209,7 @@ asmlinkage long sys_setuid(uid_t uid)
 		return -EPERM;
 
 	if (old_euid != uid) {
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
 	current->fsuid = current->euid = uid;
@@ -1254,7 +1254,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	}
 	if (euid != (uid_t) -1) {
 		if (euid != current->euid) {
-			current->mm->dumpable = suid_dumpable;
+			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
 		current->euid = euid;
@@ -1304,7 +1304,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 	}
 	if (egid != (gid_t) -1) {
 		if (egid != current->egid) {
-			current->mm->dumpable = suid_dumpable;
+			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
 		current->egid = egid;
@@ -1350,7 +1350,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
 	    uid == current->suid || uid == current->fsuid || 
 	    capable(CAP_SETUID)) {
 		if (uid != old_fsuid) {
-			current->mm->dumpable = suid_dumpable;
+			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
 		current->fsuid = uid;
@@ -1379,7 +1379,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
 	    gid == current->sgid || gid == current->fsgid || 
 	    capable(CAP_SETGID)) {
 		if (gid != old_fsgid) {
-			current->mm->dumpable = suid_dumpable;
+			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
 		current->fsgid = gid;
@@ -2176,14 +2176,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			error = put_user(current->pdeath_signal, (int __user *)arg2);
 			break;
 		case PR_GET_DUMPABLE:
-			error = current->mm->dumpable;
+			error = get_dumpable(current->mm);
 			break;
 		case PR_SET_DUMPABLE:
 			if (arg2 < 0 || arg2 > 1) {
 				error = -EINVAL;
 				break;
 			}
-			current->mm->dumpable = arg2;
+			set_dumpable(current->mm, arg2);
 			break;
 
 		case PR_SET_UNALIGN:
diff --git a/security/commoncap.c b/security/commoncap.c
index 384379ede4fd..338606eb7238 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -148,7 +148,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
 	    !cap_issubset (new_permitted, current->cap_permitted)) {
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 
 		if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 			if (!capable(CAP_SETUID)) {
diff --git a/security/dummy.c b/security/dummy.c
index d6a112ce2975..19d813d5e083 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -130,7 +130,7 @@ static void dummy_bprm_free_security (struct linux_binprm *bprm)
 static void dummy_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 {
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) {
-		current->mm->dumpable = suid_dumpable;
+		set_dumpable(current->mm, suid_dumpable);
 
 		if ((unsafe & ~LSM_UNSAFE_PTRACE_CAP) && !capable(CAP_SETUID)) {
 			bprm->e_uid = current->uid;
-- 
cgit v1.3-8-gc7d7


From 3cb4a0bb1e773e3c41800b33a3f7dab32bd06c64 Mon Sep 17 00:00:00 2001
From: "Kawai, Hidehiro" <hidehiro.kawai.ez@hitachi.com>
Date: Thu, 19 Jul 2007 01:48:28 -0700
Subject: coredump masking: add an interface for core dump filter

This patch adds an interface to set/reset flags which determines each memory
segment should be dumped or not when a core file is generated.

/proc/<pid>/coredump_filter file is provided to access the flags.  You can
change the flag status for a particular process by writing to or reading from
the file.

The flag status is inherited to the child process when it is created.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c        | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h | 14 ++++++++
 kernel/fork.c         |  2 ++
 3 files changed, 105 insertions(+)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 49b3ab0175e0..3c77d5a64e7c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -72,6 +72,7 @@
 #include <linux/poll.h>
 #include <linux/nsproxy.h>
 #include <linux/oom.h>
+#include <linux/elf.h>
 #include "internal.h"
 
 /* NOTE:
@@ -1785,6 +1786,91 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
 
 #endif
 
+#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
+					 size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+	struct mm_struct *mm;
+	char buffer[PROC_NUMBUF];
+	size_t len;
+	int ret;
+
+	if (!task)
+		return -ESRCH;
+
+	ret = 0;
+	mm = get_task_mm(task);
+	if (mm) {
+		len = snprintf(buffer, sizeof(buffer), "%08lx\n",
+			       ((mm->flags & MMF_DUMP_FILTER_MASK) >>
+				MMF_DUMP_FILTER_SHIFT));
+		mmput(mm);
+		ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
+	}
+
+	put_task_struct(task);
+
+	return ret;
+}
+
+static ssize_t proc_coredump_filter_write(struct file *file,
+					  const char __user *buf,
+					  size_t count,
+					  loff_t *ppos)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	char buffer[PROC_NUMBUF], *end;
+	unsigned int val;
+	int ret;
+	int i;
+	unsigned long mask;
+
+	ret = -EFAULT;
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		goto out_no_task;
+
+	ret = -EINVAL;
+	val = (unsigned int)simple_strtoul(buffer, &end, 0);
+	if (*end == '\n')
+		end++;
+	if (end - buffer == 0)
+		goto out_no_task;
+
+	ret = -ESRCH;
+	task = get_proc_task(file->f_dentry->d_inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = end - buffer;
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
+		if (val & mask)
+			set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+		else
+			clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+	}
+
+	mmput(mm);
+ out_no_mm:
+	put_task_struct(task);
+ out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_coredump_filter_operations = {
+	.read		= proc_coredump_filter_read,
+	.write		= proc_coredump_filter_write,
+};
+#endif
+
 /*
  * /proc/self:
  */
@@ -2005,6 +2091,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
 #endif
+#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+	REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
+#endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, pid_io_accounting),
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8dbd08366400..94f624aef017 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -349,8 +349,22 @@ extern void set_dumpable(struct mm_struct *mm, int value);
 extern int get_dumpable(struct mm_struct *mm);
 
 /* mm flags */
+/* dumpable bits */
 #define MMF_DUMPABLE      0  /* core dump is permitted */
 #define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
+#define MMF_DUMPABLE_BITS 2
+
+/* coredump filter bits */
+#define MMF_DUMP_ANON_PRIVATE	2
+#define MMF_DUMP_ANON_SHARED	3
+#define MMF_DUMP_MAPPED_PRIVATE	4
+#define MMF_DUMP_MAPPED_SHARED	5
+#define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
+#define MMF_DUMP_FILTER_BITS	4
+#define MMF_DUMP_FILTER_MASK \
+	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
+#define MMF_DUMP_FILTER_DEFAULT \
+	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED))
 
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
diff --git a/kernel/fork.c b/kernel/fork.c
index ba39bdb2a7b8..469838998220 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -334,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
+	mm->flags = (current->mm) ? current->mm->flags
+				  : MMF_DUMP_FILTER_DEFAULT;
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
-- 
cgit v1.3-8-gc7d7


From d9664c95afe5baa92ea56eff6a1c18e7b7a2cbe7 Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Thu, 19 Jul 2007 01:48:46 -0700
Subject: coda: block signals during upcall processing

We ignore signals for about 30 seconds to give userspace a chance to see the
upcall.  As we did not block signals we ended up in a busy loop for the
remainder of the period when a signal is received.

Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/upcall.c           | 81 ++++++++++++++++++++++++++++++++++------------
 include/linux/coda_psdev.h |  1 -
 2 files changed, 60 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 44332efa8411..ad65ee01790f 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -638,42 +638,83 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
 
 /*
  * coda_upcall and coda_downcall routines.
- * 
  */
+static void block_signals(sigset_t *old)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	*old = current->blocked;
+
+	sigfillset(&current->blocked);
+	sigdelset(&current->blocked, SIGKILL);
+	sigdelset(&current->blocked, SIGSTOP);
+	sigdelset(&current->blocked, SIGINT);
+
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void unblock_signals(sigset_t *old)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	current->blocked = *old;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/* Don't allow signals to interrupt the following upcalls before venus
+ * has seen them,
+ * - CODA_CLOSE or CODA_RELEASE upcall  (to avoid reference count problems)
+ * - CODA_STORE				(to avoid data loss)
+ */
+#define CODA_INTERRUPTIBLE(r) (!coda_hard && \
+			       (((r)->uc_opcode != CODA_CLOSE && \
+				 (r)->uc_opcode != CODA_STORE && \
+				 (r)->uc_opcode != CODA_RELEASE) || \
+				(r)->uc_flags & REQ_READ))
 
-static inline void coda_waitfor_upcall(struct upc_req *vmp)
+static inline void coda_waitfor_upcall(struct upc_req *req)
 {
 	DECLARE_WAITQUEUE(wait, current);
+	unsigned long timeout = jiffies + coda_timeout * HZ;
+	sigset_t old;
+	int blocked;
 
-	vmp->uc_posttime = jiffies;
+	block_signals(&old);
+	blocked = 1;
 
-	add_wait_queue(&vmp->uc_sleep, &wait);
+	add_wait_queue(&req->uc_sleep, &wait);
 	for (;;) {
-		if ( !coda_hard && vmp->uc_opcode != CODA_CLOSE ) 
+		if (CODA_INTERRUPTIBLE(req))
 			set_current_state(TASK_INTERRUPTIBLE);
 		else
 			set_current_state(TASK_UNINTERRUPTIBLE);
 
 		/* got a reply */
-		if ( vmp->uc_flags & ( REQ_WRITE | REQ_ABORT ) )
+		if (req->uc_flags & (REQ_WRITE | REQ_ABORT))
 			break;
 
-		if ( !coda_hard && vmp->uc_opcode != CODA_CLOSE && signal_pending(current) ) {
-			/* if this process really wants to die, let it go */
-			if ( sigismember(&(current->pending.signal), SIGKILL) ||
-			     sigismember(&(current->pending.signal), SIGINT) )
-				break;
-			/* signal is present: after timeout always return 
-			   really smart idea, probably useless ... */
-			if ( jiffies - vmp->uc_posttime > coda_timeout * HZ )
-				break; 
+		if (blocked && time_after(jiffies, timeout) &&
+		    CODA_INTERRUPTIBLE(req))
+		{
+			unblock_signals(&old);
+			blocked = 0;
 		}
-		schedule();
+
+		if (signal_pending(current)) {
+			list_del(&req->uc_chain);
+			break;
+		}
+
+		if (blocked)
+			schedule_timeout(HZ);
+		else
+			schedule();
 	}
-	remove_wait_queue(&vmp->uc_sleep, &wait);
-	set_current_state(TASK_RUNNING);
+	if (blocked)
+		unblock_signals(&old);
 
-	return;
+	remove_wait_queue(&req->uc_sleep, &wait);
+	set_current_state(TASK_RUNNING);
 }
 
 
@@ -750,8 +791,6 @@ static int coda_upcall(struct coda_sb_info *sbi,
 		goto exit;
 	}
 
-	list_del(&(req->uc_chain));
-
 	/* Interrupted before venus read it. */
 	if (!(req->uc_flags & REQ_READ))
 		goto exit;
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index b541bb3d1f4b..f28c2f7fd454 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -85,7 +85,6 @@ struct upc_req {
 	u_short	            uc_opcode;  /* copied from data to save lookup */
 	int		    uc_unique;
 	wait_queue_head_t   uc_sleep;   /* process' wait queue */
-	unsigned long       uc_posttime;
 };
 
 #define REQ_ASYNC  0x1
-- 
cgit v1.3-8-gc7d7


From a1b0aa87647493c0201821ab884e86298d5da7d6 Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Thu, 19 Jul 2007 01:48:50 -0700
Subject: coda: remove struct coda_sb_info

The sb_info structure only contains a single pointer to the character device,
there is no need for the added indirection.

Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/cache.c            |  5 ---
 fs/coda/inode.c            | 32 ++++++-------------
 fs/coda/upcall.c           | 79 ++++++++++++++++++++++------------------------
 include/linux/coda_psdev.h |  9 ++----
 4 files changed, 49 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 11538a2b5423..8a2370341c7a 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -49,11 +49,6 @@ void coda_cache_clear_inode(struct inode *inode)
 /* remove all acl caches */
 void coda_cache_clear_all(struct super_block *sb)
 {
-        struct coda_sb_info *sbi;
-
-        sbi = coda_sbp(sb);
-	BUG_ON(!sbi);
-
 	atomic_inc(&permission_epoch);
 }
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 29e441765600..6771a4271e33 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -141,11 +141,10 @@ static int get_device_index(struct coda_mount_data *data)
 
 static int coda_fill_super(struct super_block *sb, void *data, int silent)
 {
-        struct inode *root = NULL; 
-	struct coda_sb_info *sbi = NULL;
+	struct inode *root = NULL;
 	struct venus_comm *vc = NULL;
 	struct CodaFid fid;
-        int error;
+	int error;
 	int idx;
 
 	idx = get_device_index((struct coda_mount_data *) data);
@@ -167,16 +166,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 		return -EBUSY;
 	}
 
-	sbi = kmalloc(sizeof(struct coda_sb_info), GFP_KERNEL);
-	if(!sbi) {
-		return -ENOMEM;
-	}
-
 	vc->vc_sb = sb;
 
-	sbi->sbi_vcomm = vc;
-
-	sb->s_fs_info = sbi;
+	sb->s_fs_info = vc;
 	sb->s_flags |= MS_NOATIME;
 	sb->s_blocksize = 4096;	/* XXXXX  what do we put here?? */
 	sb->s_blocksize_bits = 12;
@@ -207,26 +199,20 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
         return 0;
 
  error:
-	if (sbi) {
-		kfree(sbi);
-		if(vc)
-			vc->vc_sb = NULL;		
-	}
 	if (root)
-                iput(root);
+		iput(root);
+	if (vc)
+		vc->vc_sb = NULL;
 
-        return -EINVAL;
+	return -EINVAL;
 }
 
 static void coda_put_super(struct super_block *sb)
 {
-        struct coda_sb_info *sbi;
-
-	sbi = coda_sbp(sb);
-	sbi->sbi_vcomm->vc_sb = NULL;
+	coda_vcp(sb)->vc_sb = NULL;
+	sb->s_fs_info = NULL;
 
 	printk("Coda: Bye bye.\n");
-	kfree(sbi);
 }
 
 static void coda_clear_inode(struct inode *inode)
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 87601e147644..9a20a3b1998e 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -37,7 +37,7 @@
 #include <linux/coda_cache.h>
 #include <linux/coda_proc.h> 
 
-static int coda_upcall(struct coda_sb_info *mntinfo, int inSize, int *outSize, 
+static int coda_upcall(struct venus_comm *vc, int inSize, int *outSize,
 		       union inputArgs *buffer);
 
 static void *alloc_upcall(int opcode, int size)
@@ -83,7 +83,7 @@ int venus_rootfid(struct super_block *sb, struct CodaFid *fidp)
         insize = SIZE(root);
         UPARG(CODA_ROOT);
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 	if (!error)
 		*fidp = outp->coda_root.VFid;
 
@@ -102,7 +102,7 @@ int venus_getattr(struct super_block *sb, struct CodaFid *fid,
 	UPARG(CODA_GETATTR);
         inp->coda_getattr.VFid = *fid;
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 	if (!error)
 		*attr = outp->coda_getattr.attr;
 
@@ -123,7 +123,7 @@ int venus_setattr(struct super_block *sb, struct CodaFid *fid,
         inp->coda_setattr.VFid = *fid;
 	inp->coda_setattr.attr = *vattr;
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
         CODA_FREE(inp, insize);
         return error;
@@ -149,7 +149,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid,
         memcpy((char *)(inp) + offset, name, length);
         *((char *)inp + offset + length) = '\0';
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 	if (!error) {
 		*resfid = outp->coda_lookup.VFid;
 		*type = outp->coda_lookup.vtype;
@@ -182,7 +182,7 @@ int venus_store(struct super_block *sb, struct CodaFid *fid, int flags,
         inp->coda_store.VFid = *fid;
         inp->coda_store.flags = flags;
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
         return error;
@@ -200,7 +200,7 @@ int venus_release(struct super_block *sb, struct CodaFid *fid, int flags)
 	inp->coda_release.VFid = *fid;
 	inp->coda_release.flags = flags;
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
 	return error;
@@ -229,7 +229,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
         inp->coda_close.VFid = *fid;
         inp->coda_close.flags = flags;
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
         return error;
@@ -248,7 +248,7 @@ int venus_open(struct super_block *sb, struct CodaFid *fid,
 	inp->coda_open_by_fd.VFid = *fid;
 	inp->coda_open_by_fd.flags = flags;
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 	if (!error)
 		*fh = outp->coda_open_by_fd.fh;
 
@@ -276,7 +276,7 @@ int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid,
         memcpy((char *)(inp) + offset, name, length);
         *((char *)inp + offset + length) = '\0';
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 	if (!error) {
 		*attrs = outp->coda_mkdir.attr;
 		*newfid = outp->coda_mkdir.VFid;
@@ -318,7 +318,7 @@ int venus_rename(struct super_block *sb, struct CodaFid *old_fid,
         memcpy((char *)(inp) + offset, new_name, new_length);
         *((char *)inp + offset + new_length) = '\0';
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
 	return error;
@@ -347,7 +347,7 @@ int venus_create(struct super_block *sb, struct CodaFid *dirfid,
         memcpy((char *)(inp) + offset, name, length);
         *((char *)inp + offset + length) = '\0';
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 	if (!error) {
 		*attrs = outp->coda_create.attr;
 		*newfid = outp->coda_create.VFid;
@@ -373,8 +373,8 @@ int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid,
         inp->coda_rmdir.name = offset;
         memcpy((char *)(inp) + offset, name, length);
 	*((char *)inp + offset + length) = '\0';
-        
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
 	return error;
@@ -395,8 +395,8 @@ int venus_remove(struct super_block *sb, struct CodaFid *dirfid,
         inp->coda_remove.name = offset;
         memcpy((char *)(inp) + offset, name, length);
 	*((char *)inp + offset + length) = '\0';
-        
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
 	return error;
@@ -417,7 +417,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
 
         inp->coda_readlink.VFid = *fid;
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 	if (!error) {
 		retlen = outp->coda_readlink.count;
 		if ( retlen > *length )
@@ -453,8 +453,8 @@ int venus_link(struct super_block *sb, struct CodaFid *fid,
         /* make sure strings are null terminated */
         memcpy((char *)(inp) + offset, name, len);
         *((char *)inp + offset + len) = '\0';
-        
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
         return error;
@@ -489,7 +489,7 @@ int venus_symlink(struct super_block *sb, struct CodaFid *fid,
         memcpy((char *)(inp) + offset, name, len);
         *((char *)inp + offset + len) = '\0';
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
         return error;
@@ -504,9 +504,9 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid)
 	insize=SIZE(fsync);
 	UPARG(CODA_FSYNC);
 
-        inp->coda_fsync.VFid = *fid;
-        error = coda_upcall(coda_sbp(sb), sizeof(union inputArgs), 
-                            &outsize, inp);
+	inp->coda_fsync.VFid = *fid;
+	error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs),
+			    &outsize, inp);
 
 	CODA_FREE(inp, insize);
 	return error;
@@ -524,7 +524,7 @@ int venus_access(struct super_block *sb, struct CodaFid *fid, int mask)
         inp->coda_access.VFid = *fid;
         inp->coda_access.flags = mask;
 
-	error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
 	return error;
@@ -573,9 +573,9 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 	        goto exit;
 	}
 
-        error = coda_upcall(coda_sbp(sb), SIZE(ioctl) + data->vi.in_size,
-                            &outsize, inp);
-        
+	error = coda_upcall(coda_vcp(sb), SIZE(ioctl) + data->vi.in_size,
+			    &outsize, inp);
+
         if (error) {
 	        printk("coda_pioctl: Venus returns: %d for %s\n", 
 		       error, coda_f2s(fid));
@@ -615,7 +615,7 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
 	insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
 	UPARG(CODA_STATFS);
 
-	error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp);
+	error = coda_upcall(coda_vcp(dentry->d_sb), insize, &outsize, inp);
 	if (!error) {
 		sfs->f_blocks = outp->coda_statfs.stat.f_blocks;
 		sfs->f_bfree  = outp->coda_statfs.stat.f_bfree;
@@ -710,28 +710,25 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
 }
 
 
-/* 
- * coda_upcall will return an error in the case of 
+/*
+ * coda_upcall will return an error in the case of
  * failed communication with Venus _or_ will peek at Venus
  * reply and return Venus' error.
  *
  * As venus has 2 types of errors, normal errors (positive) and internal
  * errors (negative), normal errors are negated, while internal errors
  * are all mapped to -EINTR, while showing a nice warning message. (jh)
- * 
  */
-static int coda_upcall(struct coda_sb_info *sbi,
+static int coda_upcall(struct venus_comm *vcp,
 		       int inSize, int *outSize,
 		       union inputArgs *buffer)
 {
-	struct venus_comm *vcommp;
 	union outputArgs *out;
 	union inputArgs *sig_inputArgs;
 	struct upc_req *req, *sig_req;
 	int error = 0;
 
-	vcommp = sbi->sbi_vcomm;
-	if (!vcommp->vc_inuse) {
+	if (!vcp->vc_inuse) {
 		printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
 		return -ENXIO;
 	}
@@ -746,16 +743,16 @@ static int coda_upcall(struct coda_sb_info *sbi,
 	req->uc_inSize = inSize;
 	req->uc_outSize = *outSize ? *outSize : inSize;
 	req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode;
-	req->uc_unique = ++vcommp->vc_seq;
+	req->uc_unique = ++vcp->vc_seq;
 	init_waitqueue_head(&req->uc_sleep);
 
 	/* Fill in the common input args. */
 	((union inputArgs *)buffer)->ih.unique = req->uc_unique;
 
 	/* Append msg to pending queue and poke Venus. */
-	list_add_tail(&req->uc_chain, &vcommp->vc_pending);
+	list_add_tail(&req->uc_chain, &vcp->vc_pending);
 
-	wake_up_interruptible(&vcommp->vc_waitq);
+	wake_up_interruptible(&vcp->vc_waitq);
 	/* We can be interrupted while we wait for Venus to process
 	 * our request.  If the interrupt occurs before Venus has read
 	 * the request, we dequeue and return. If it occurs after the
@@ -788,7 +785,7 @@ static int coda_upcall(struct coda_sb_info *sbi,
 		goto exit;
 
 	/* Venus saw the upcall, make sure we can send interrupt signal */
-	if (!vcommp->vc_inuse) {
+	if (!vcp->vc_inuse) {
 		printk(KERN_INFO "coda: Venus dead, not sending signal.\n");
 		goto exit;
 	}
@@ -815,8 +812,8 @@ static int coda_upcall(struct coda_sb_info *sbi,
 	sig_req->uc_outSize = sizeof(struct coda_in_hdr);
 
 	/* insert at head of queue! */
-	list_add(&(sig_req->uc_chain), &vcommp->vc_pending);
-	wake_up_interruptible(&vcommp->vc_waitq);
+	list_add(&(sig_req->uc_chain), &vcp->vc_pending);
+	wake_up_interruptible(&vcp->vc_waitq);
 
 exit:
 	kfree(req);
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index f28c2f7fd454..81b2e4c7d7ce 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -8,11 +8,6 @@
 
 struct kstatfs;
 
-struct coda_sb_info
-{
-	struct venus_comm *sbi_vcomm;
-};
-
 /* communication pending/processing queues */
 struct venus_comm {
 	u_long		    vc_seq;
@@ -24,9 +19,9 @@ struct venus_comm {
 };
 
 
-static inline struct coda_sb_info *coda_sbp(struct super_block *sb)
+static inline struct venus_comm *coda_vcp(struct super_block *sb)
 {
-    return ((struct coda_sb_info *)((sb)->s_fs_info));
+	return (struct venus_comm *)((sb)->s_fs_info);
 }
 
 
-- 
cgit v1.3-8-gc7d7


From 3cf01f28c303be34f18cb4f6204cf1bdfe12ba7c Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Thu, 19 Jul 2007 01:48:51 -0700
Subject: coda: remove statistics counters from /proc/fs/coda

Similar information can easily be obtained with strace -c.

Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/coda_int.h         |   7 ++
 fs/coda/dir.c              |  16 +---
 fs/coda/file.c             |  10 +-
 fs/coda/psdev.c            |   1 -
 fs/coda/symlink.c          |   2 -
 fs/coda/sysctl.c           | 228 +--------------------------------------------
 fs/coda/upcall.c           |   3 +-
 include/linux/coda_linux.h |   3 -
 include/linux/coda_proc.h  |  76 ---------------
 include/linux/coda_psdev.h |   2 -
 10 files changed, 15 insertions(+), 333 deletions(-)
 delete mode 100644 include/linux/coda_proc.h

(limited to 'include/linux')

diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 9e6338fea514..8ccd5ed81d9c 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -1,12 +1,19 @@
 #ifndef _CODA_INT_
 #define _CODA_INT_
 
+struct dentry;
+
 extern struct file_system_type coda_fs_type;
+extern unsigned long coda_timeout;
+extern int coda_hard;
+extern int coda_fake_statfs;
 
 void coda_destroy_inodecache(void);
 int coda_init_inodecache(void);
 int coda_fsync(struct file *coda_file, struct dentry *coda_dentry,
 	       int datasync);
+void coda_sysctl_init(void);
+void coda_sysctl_clean(void);
 
 #endif  /*  _CODA_INT_  */
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 0c6c48ca7496..04a3dd84c993 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -25,7 +25,6 @@
 #include <linux/coda_psdev.h>
 #include <linux/coda_fs_i.h>
 #include <linux/coda_cache.h>
-#include <linux/coda_proc.h>
 
 #include "coda_int.h"
 
@@ -148,8 +147,6 @@ int coda_permission(struct inode *inode, int mask, struct nameidata *nd)
 
 	lock_kernel();
 
-	coda_vfs_stat.permission++;
-
 	if (coda_cache_check(inode, mask))
 		goto out; 
 
@@ -206,7 +203,6 @@ static int coda_create(struct inode *dir, struct dentry *de, int mode, struct na
 	struct coda_vattr attrs;
 
 	lock_kernel();
-	coda_vfs_stat.create++;
 
 	if (coda_isroot(dir) && coda_iscontrol(name, length)) {
 		unlock_kernel();
@@ -246,7 +242,6 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
 	struct CodaFid newfid;
 
 	lock_kernel();
-	coda_vfs_stat.mkdir++;
 
 	if (coda_isroot(dir) && coda_iscontrol(name, len)) {
 		unlock_kernel();
@@ -288,7 +283,6 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 	int error;
 
 	lock_kernel();
-	coda_vfs_stat.link++;
 
 	if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
 		unlock_kernel();
@@ -320,10 +314,9 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de,
         const char *name = de->d_name.name;
 	int len = de->d_name.len;
 	int symlen;
-        int error=0;
-        
+	int error = 0;
+
 	lock_kernel();
-	coda_vfs_stat.symlink++;
 
 	if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) {
 		unlock_kernel();
@@ -360,7 +353,6 @@ int coda_unlink(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 
 	lock_kernel();
-	coda_vfs_stat.unlink++;
 
 	error = venus_remove(dir->i_sb, coda_i2f(dir), name, len);
 	if ( error ) {
@@ -381,7 +373,6 @@ int coda_rmdir(struct inode *dir, struct dentry *de)
 	int error;
 
 	lock_kernel();
-	coda_vfs_stat.rmdir++;
 
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
@@ -408,7 +399,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int error;
 
 	lock_kernel();
-	coda_vfs_stat.rename++;
 
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
@@ -445,8 +435,6 @@ int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	coda_vfs_stat.readdir++;
-
 	if (!host_file->f_op)
 		return -ENOTDIR;
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index e7d622709c90..7594962604c2 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -22,7 +22,6 @@
 #include <linux/coda_linux.h>
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_proc.h>
 
 #include "coda_int.h"
 
@@ -134,8 +133,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	unsigned short coda_flags = coda_flags_to_cflags(flags);
 	struct coda_file_info *cfi;
 
-	coda_vfs_stat.open++;
-
 	cfi = kmalloc(sizeof(struct coda_file_info), GFP_KERNEL);
 	if (!cfi)
 		return -ENOMEM;
@@ -176,8 +173,6 @@ int coda_flush(struct file *coda_file, fl_owner_t id)
 
 	lock_kernel();
 
-	coda_vfs_stat.flush++;
-
 	/* last close semantics */
 	fcnt = file_count(coda_file);
 	if (fcnt > 1)
@@ -219,8 +214,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	int err = 0;
 
 	lock_kernel();
-	coda_vfs_stat.release++;
- 
+
 	if (!use_coda_close) {
 		err = venus_release(coda_inode->i_sb, coda_i2f(coda_inode),
 				    coda_flags);
@@ -271,8 +265,6 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	coda_vfs_stat.fsync++;
-
 	if (host_file->f_op && host_file->f_op->fsync) {
 		host_dentry = host_file->f_path.dentry;
 		host_inode = host_dentry->d_inode;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 8a09f19596db..e3a0a4164d5d 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -45,7 +45,6 @@
 #include <linux/coda_linux.h>
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
-#include <linux/coda_proc.h>
 
 #include "coda_int.h"
 
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 76e00a65a75b..4513b7258458 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -20,7 +20,6 @@
 #include <linux/coda_linux.h>
 #include <linux/coda_psdev.h>
 #include <linux/coda_fs_i.h>
-#include <linux/coda_proc.h>
 
 static int coda_symlink_filler(struct file *file, struct page *page)
 {
@@ -32,7 +31,6 @@ static int coda_symlink_filler(struct file *file, struct page *page)
 
 	lock_kernel();
 	cii = ITOC(inode);
-	coda_vfs_stat.follow_link++;
 
 	error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
 	unlock_kernel();
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index c57a1fa7cf23..81b7771c6465 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -5,181 +5,14 @@
  * 
  * Carnegie Mellon encourages users to contribute improvements to
  * the Coda project. Contact Peter Braam (coda@cs.cmu.edu).
- * 
- * CODA operation statistics
- * (c) March, 1998 Zhanyong Wan <zhanyong.wan@yale.edu>
- *
  */
 
-#include <linux/time.h>
-#include <linux/mm.h>
 #include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/ctype.h>
-#include <linux/bitops.h>
-#include <asm/uaccess.h>
-#include <linux/utsname.h>
-#include <linux/module.h>
 
-#include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
-#include <linux/coda_psdev.h>
-#include <linux/coda_cache.h>
-#include <linux/coda_proc.h>
+#include "coda_int.h"
 
 static struct ctl_table_header *fs_table_header;
 
-#define CODA_TIMEOUT    3       /* timeout on upcalls to become intrble */
-#define CODA_HARD       5       /* mount type "hard" or "soft" */
-#define CODA_VFS 	 6       /* vfs statistics */
-#define CODA_CACHE_INV 	 9       /* cache invalidation statistics */
-#define CODA_FAKE_STATFS 10	 /* don't query venus for actual cache usage */
-
-struct coda_vfs_stats		coda_vfs_stat;
-static struct coda_cache_inv_stats	coda_cache_inv_stat;
-
-static void reset_coda_vfs_stats( void )
-{
-	memset( &coda_vfs_stat, 0, sizeof( coda_vfs_stat ) );
-}
-
-static void reset_coda_cache_inv_stats( void )
-{
-	memset( &coda_cache_inv_stat, 0, sizeof( coda_cache_inv_stat ) );
-}
-
-static int do_reset_coda_vfs_stats( ctl_table * table, int write,
-				    struct file * filp, void __user * buffer,
-				    size_t * lenp, loff_t * ppos )
-{
-	if ( write ) {
-		reset_coda_vfs_stats();
-
-		*ppos += *lenp;
-	} else {
-		*lenp = 0;
-	}
-
-	return 0;
-}
-
-static int do_reset_coda_cache_inv_stats( ctl_table * table, int write,
-					  struct file * filp,
-					  void __user * buffer,
-					  size_t * lenp, loff_t * ppos )
-{
-	if ( write ) {
-		reset_coda_cache_inv_stats();
-
-		*ppos += *lenp;
-	} else {
-		*lenp = 0;
-	}
-  
-	return 0;
-}
-
-static int proc_vfs_stats_show(struct seq_file *m, void *v)
-{
-	struct coda_vfs_stats * ps = & coda_vfs_stat;
-  
-	seq_printf(m,
-			"Coda VFS statistics\n"
-			"===================\n\n"
-			"File Operations:\n"
-			"\topen\t\t%9d\n"
-			"\tflush\t\t%9d\n"
-			"\trelease\t\t%9d\n"
-			"\tfsync\t\t%9d\n\n"
-			"Dir Operations:\n"
-			"\treaddir\t\t%9d\n\n"
-			"Inode Operations\n"
-			"\tcreate\t\t%9d\n"
-			"\tlookup\t\t%9d\n"
-			"\tlink\t\t%9d\n"
-			"\tunlink\t\t%9d\n"
-			"\tsymlink\t\t%9d\n"
-			"\tmkdir\t\t%9d\n"
-			"\trmdir\t\t%9d\n"
-			"\trename\t\t%9d\n"
-			"\tpermission\t%9d\n",
-
-			/* file operations */
-			ps->open,
-			ps->flush,
-			ps->release,
-			ps->fsync,
-
-			/* dir operations */
-			ps->readdir,
-		  
-			/* inode operations */
-			ps->create,
-			ps->lookup,
-			ps->link,
-			ps->unlink,
-			ps->symlink,
-			ps->mkdir,
-			ps->rmdir,
-			ps->rename,
-			ps->permission); 
-	return 0;
-}
-
-static int proc_cache_inv_stats_show(struct seq_file *m, void *v)
-{
-	struct coda_cache_inv_stats * ps = & coda_cache_inv_stat;
-  
-	seq_printf(m,
-			"Coda cache invalidation statistics\n"
-			"==================================\n\n"
-			"flush\t\t%9d\n"
-			"purge user\t%9d\n"
-			"zap_dir\t\t%9d\n"
-			"zap_file\t%9d\n"
-			"zap_vnode\t%9d\n"
-			"purge_fid\t%9d\n"
-			"replace\t\t%9d\n",
-			ps->flush,
-			ps->purge_user,
-			ps->zap_dir,
-			ps->zap_file,
-			ps->zap_vnode,
-			ps->purge_fid,
-			ps->replace );
-	return 0;
-}
-
-static int proc_vfs_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_vfs_stats_show, NULL);
-}
-
-static int proc_cache_inv_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_cache_inv_stats_show, NULL);
-}
-
-static const struct file_operations proc_vfs_stats_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_vfs_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static const struct file_operations proc_cache_inv_stats_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_cache_inv_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static ctl_table coda_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -197,22 +30,6 @@ static ctl_table coda_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "vfs_stats",
-		.data		= NULL,
-		.maxlen		= 0,
-		.mode		= 0644,
-		.proc_handler	= &do_reset_coda_vfs_stats
-	},
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "cache_inv_stats",
-		.data		= NULL,
-		.maxlen		= 0,
-		.mode		= 0644,
-		.proc_handler	= &do_reset_coda_cache_inv_stats
-	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "fake_statfs",
@@ -235,59 +52,20 @@ static ctl_table fs_table[] = {
 };
 
 
-#ifdef CONFIG_PROC_FS
-
-/*
- target directory structure:
-   /proc/fs  (see linux/fs/proc/root.c)
-   /proc/fs/coda
-   /proc/fs/coda/{vfs_stats,
-
-*/
-
-static struct proc_dir_entry* proc_fs_coda;
-
-#endif
-
 void coda_sysctl_init(void)
 {
-	reset_coda_vfs_stats();
-	reset_coda_cache_inv_stats();
-
-#ifdef CONFIG_PROC_FS
-	proc_fs_coda = proc_mkdir("coda", proc_root_fs);
-	if (proc_fs_coda) {
-		struct proc_dir_entry *pde;
-
-		proc_fs_coda->owner = THIS_MODULE;
-		pde = create_proc_entry("vfs_stats", 0, proc_fs_coda);
-		if (pde)
-			pde->proc_fops = &proc_vfs_stats_fops;
-		pde = create_proc_entry("cache_inv_stats", 0, proc_fs_coda);
-		if (pde)
-			pde->proc_fops = &proc_cache_inv_stats_fops;
-	}
-#endif
-
 #ifdef CONFIG_SYSCTL
 	if ( !fs_table_header )
 		fs_table_header = register_sysctl_table(fs_table);
-#endif 
+#endif
 }
 
-void coda_sysctl_clean(void) 
+void coda_sysctl_clean(void)
 {
-
 #ifdef CONFIG_SYSCTL
 	if ( fs_table_header ) {
 		unregister_sysctl_table(fs_table_header);
 		fs_table_header = NULL;
 	}
 #endif
-
-#ifdef CONFIG_PROC_FS
-        remove_proc_entry("cache_inv_stats", proc_fs_coda);
-        remove_proc_entry("vfs_stats", proc_fs_coda);
-	remove_proc_entry("coda", proc_root_fs);
-#endif 
 }
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9a20a3b1998e..e4e766e5557c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -35,7 +35,8 @@
 #include <linux/coda_psdev.h>
 #include <linux/coda_fs_i.h>
 #include <linux/coda_cache.h>
-#include <linux/coda_proc.h> 
+
+#include "coda_int.h"
 
 static int coda_upcall(struct venus_comm *vc, int inSize, int *outSize,
 		       union inputArgs *buffer);
diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h
index e4ac016ad272..c4079b403e9e 100644
--- a/include/linux/coda_linux.h
+++ b/include/linux/coda_linux.h
@@ -43,9 +43,6 @@ int coda_revalidate_inode(struct dentry *);
 int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int coda_setattr(struct dentry *, struct iattr *);
 
-/* global variables */
-extern int coda_fake_statfs;
-
 /* this file:  heloers */
 static __inline__ struct CodaFid *coda_i2f(struct inode *);
 static __inline__ char *coda_i2s(struct inode *);
diff --git a/include/linux/coda_proc.h b/include/linux/coda_proc.h
deleted file mode 100644
index 0dc1b0458e75..000000000000
--- a/include/linux/coda_proc.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * coda_statis.h
- * 
- * CODA operation statistics
- *
- * (c) March, 1998
- * by Michihiro Kuramochi, Zhenyu Xia and Zhanyong Wan
- * zhanyong.wan@yale.edu
- *
- */
-
-#ifndef _CODA_PROC_H
-#define _CODA_PROC_H
-
-void coda_sysctl_init(void);
-void coda_sysctl_clean(void);
-
-#include <linux/sysctl.h>
-#include <linux/coda_fs_i.h>
-#include <linux/coda.h>
-
-/* these four files are presented to show the result of the statistics:
- *
- *	/proc/fs/coda/vfs_stats
- *		      cache_inv_stats
- *
- * these four files are presented to reset the statistics to 0:
- *
- *	/proc/sys/coda/vfs_stats
- *		       cache_inv_stats
- */
-
-/* VFS operation statistics */
-struct coda_vfs_stats 
-{
-	/* file operations */
-	int open;
-	int flush;
-	int release;
-	int fsync;
-
-	/* dir operations */
-	int readdir;
-  
-	/* inode operations */
-	int create;
-	int lookup;
-	int link;
-	int unlink;
-	int symlink;
-	int mkdir;
-	int rmdir;
-	int rename;
-	int permission;
-
-	/* symlink operatoins*/
-	int follow_link;
-	int readlink;
-};
-
-/* cache invalidation statistics */
-struct coda_cache_inv_stats
-{
-	int flush;
-	int purge_user;
-	int zap_dir;
-	int zap_file;
-	int zap_vnode;
-	int purge_fid;
-	int replace;
-};
-
-/* these global variables hold the actual statistics data */
-extern struct coda_vfs_stats		coda_vfs_stat;
-
-#endif /* _CODA_PROC_H */
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 81b2e4c7d7ce..aa8f454b3b77 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -69,8 +69,6 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
 
 
 /* messages between coda filesystem in kernel and Venus */
-extern int coda_hard;
-extern unsigned long coda_timeout;
 struct upc_req {
 	struct list_head    uc_chain;
 	caddr_t	            uc_data;
-- 
cgit v1.3-8-gc7d7


From 21f8ca3bf6198bd21e3c4cc820af2ccf753a6ec8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jul 2007 01:48:53 -0700
Subject: fix raw_spinlock_t vs lockdep

Use the lockdep infrastructure to track lock contention and other lock
statistics.

It tracks lock contention events, and the first four unique call-sites that
encountered contention.

It also measures lock wait-time and hold-time in nanoseconds. The minimum and
maximum times are tracked, as well as a total (which together with the number
of event can give the avg).

All statistics are done per lock class, per write (exclusive state) and per read
(shared state).

The statistics are collected per-cpu, so that the collection overhead is
minimized via having no global cachemisses.

This new lock statistics feature is independent of the lock dependency checking
traditionally done by lockdep; it just shares the lock tracking code. It is
also possible to enable both and runtime disabled either component - thereby
avoiding the O(n^2) lock chain walks for instance.

This patch:

raw_spinlock_t should not use lockdep (and doesn't) since lockdep itself
relies on it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/spinlock_types.h    | 4 ++--
 include/linux/spinlock_types_up.h | 9 +--------
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 210549ba4ef4..f6a3a951b79e 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -9,14 +9,14 @@
  * Released under the General Public License (GPL).
  */
 
-#include <linux/lockdep.h>
-
 #if defined(CONFIG_SMP)
 # include <asm/spinlock_types.h>
 #else
 # include <linux/spinlock_types_up.h>
 #endif
 
+#include <linux/lockdep.h>
+
 typedef struct {
 	raw_spinlock_t raw_lock;
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h
index 27644af20b7c..04135b0e198e 100644
--- a/include/linux/spinlock_types_up.h
+++ b/include/linux/spinlock_types_up.h
@@ -12,14 +12,10 @@
  * Released under the General Public License (GPL).
  */
 
-#if defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_DEBUG_LOCK_ALLOC)
+#ifdef CONFIG_DEBUG_SPINLOCK
 
 typedef struct {
 	volatile unsigned int slock;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
 } raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED { 1 }
@@ -34,9 +30,6 @@ typedef struct { } raw_spinlock_t;
 
 typedef struct {
 	/* no debug version on UP */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
 } raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED { }
-- 
cgit v1.3-8-gc7d7


From f20786ff4da51e56b1956acf30be2552be266746 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jul 2007 01:48:56 -0700
Subject: lockstat: core infrastructure

Introduce the core lock statistics code.

Lock statistics provides lock wait-time and hold-time (as well as the count
of corresponding contention and acquisitions events). Also, the first few
call-sites that encounter contention are tracked.

Lock wait-time is the time spent waiting on the lock. This provides insight
into the locking scheme, that is, a heavily contended lock is indicative of
a too coarse locking scheme.

Lock hold-time is the duration the lock was held, this provides a reference for
the wait-time numbers, so they can be put into perspective.

  1)
    lock
  2)
    ... do stuff ..
    unlock
  3)

The time between 1 and 2 is the wait-time. The time between 2 and 3 is the
hold-time.

The lockdep held-lock tracking code is reused, because it already collects locks
into meaningful groups (classes), and because it is an existing infrastructure
for lock instrumentation.

Currently lockdep tracks lock acquisition with two hooks:

  lock()
    lock_acquire()
    _lock()

 ... code protected by lock ...

  unlock()
    lock_release()
    _unlock()

We need to extend this with two more hooks, in order to measure contention.

  lock_contended() - used to measure contention events
  lock_acquired()  - completion of the contention

These are then placed the following way:

  lock()
    lock_acquire()
    if (!_try_lock())
      lock_contended()
      _lock()
      lock_acquired()

 ... do locked stuff ...

  unlock()
    lock_release()
    _unlock()

(Note: the try_lock() 'trick' is used to avoid instrumenting all platform
       dependent lock primitive implementations.)

It is also possible to toggle the two lockdep features at runtime using:

  /proc/sys/kernel/prove_locking
  /proc/sys/kernel/lock_stat

(esp. turning off the O(n^2) prove_locking functionaliy can help)

[akpm@linux-foundation.org: build fixes]
[akpm@linux-foundation.org: nuke unneeded ifdefs]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lockdep.h |  53 +++++++++++
 kernel/lockdep.c        | 247 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c         |  22 +++++
 lib/Kconfig.debug       |  11 +++
 4 files changed, 333 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 14c937d345cb..8f946f614f8e 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -9,6 +9,7 @@
 #define __LINUX_LOCKDEP_H
 
 struct task_struct;
+struct lockdep_map;
 
 #ifdef CONFIG_LOCKDEP
 
@@ -114,8 +115,32 @@ struct lock_class {
 
 	const char			*name;
 	int				name_version;
+
+#ifdef CONFIG_LOCK_STAT
+	unsigned long			contention_point[4];
+#endif
+};
+
+#ifdef CONFIG_LOCK_STAT
+struct lock_time {
+	s64				min;
+	s64				max;
+	s64				total;
+	unsigned long			nr;
 };
 
+struct lock_class_stats {
+	unsigned long			contention_point[4];
+	struct lock_time		read_waittime;
+	struct lock_time		write_waittime;
+	struct lock_time		read_holdtime;
+	struct lock_time		write_holdtime;
+};
+
+struct lock_class_stats lock_stats(struct lock_class *class);
+void clear_lock_stats(struct lock_class *class);
+#endif
+
 /*
  * Map the lock object (the lock instance) to the lock-class object.
  * This is embedded into specific lock instances:
@@ -165,6 +190,10 @@ struct held_lock {
 	unsigned long			acquire_ip;
 	struct lockdep_map		*instance;
 
+#ifdef CONFIG_LOCK_STAT
+	u64 				waittime_stamp;
+	u64				holdtime_stamp;
+#endif
 	/*
 	 * The lock-stack is unified in that the lock chains of interrupt
 	 * contexts nest ontop of process context chains, but we 'separate'
@@ -281,6 +310,30 @@ struct lock_class_key { };
 
 #endif /* !LOCKDEP */
 
+#ifdef CONFIG_LOCK_STAT
+
+extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
+extern void lock_acquired(struct lockdep_map *lock);
+
+#define LOCK_CONTENDED(_lock, try, lock)			\
+do {								\
+	if (!try(_lock)) {					\
+		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
+		lock(_lock);					\
+		lock_acquired(&(_lock)->dep_map);		\
+	}							\
+} while (0)
+
+#else /* CONFIG_LOCK_STAT */
+
+#define lock_contended(lockdep_map, ip) do {} while (0)
+#define lock_acquired(lockdep_map) do {} while (0)
+
+#define LOCK_CONTENDED(_lock, try, lock) \
+	lock(_lock)
+
+#endif /* CONFIG_LOCK_STAT */
+
 #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS)
 extern void early_init_irq_lock_class(void);
 #else
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 87ac36425070..70ca4db28aff 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,20 @@
 
 #include "lockdep_internals.h"
 
+#ifdef CONFIG_PROVE_LOCKING
+int prove_locking = 1;
+module_param(prove_locking, int, 0644);
+#else
+#define prove_locking 0
+#endif
+
+#ifdef CONFIG_LOCK_STAT
+int lock_stat = 1;
+module_param(lock_stat, int, 0644);
+#else
+#define lock_stat 0
+#endif
+
 /*
  * lockdep_lock: protects the lockdep graph, the hashes and the
  *               class/list/hash allocators.
@@ -104,6 +118,70 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 unsigned long nr_lock_classes;
 static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
 
+#ifdef CONFIG_LOCK_STAT
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
+
+static int lock_contention_point(struct lock_class *class, unsigned long ip)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
+		if (class->contention_point[i] == 0) {
+			class->contention_point[i] = ip;
+			break;
+		}
+		if (class->contention_point[i] == ip)
+			break;
+	}
+
+	return i;
+}
+
+static void lock_time_inc(struct lock_time *lt, s64 time)
+{
+	if (time > lt->max)
+		lt->max = time;
+
+	if (time < lt->min || !lt->min)
+		lt->min = time;
+
+	lt->total += time;
+	lt->nr++;
+}
+
+static struct lock_class_stats *get_lock_stats(struct lock_class *class)
+{
+	return &get_cpu_var(lock_stats)[class - lock_classes];
+}
+
+static void put_lock_stats(struct lock_class_stats *stats)
+{
+	put_cpu_var(lock_stats);
+}
+
+static void lock_release_holdtime(struct held_lock *hlock)
+{
+	struct lock_class_stats *stats;
+	s64 holdtime;
+
+	if (!lock_stat)
+		return;
+
+	holdtime = sched_clock() - hlock->holdtime_stamp;
+
+	stats = get_lock_stats(hlock->class);
+	if (hlock->read)
+		lock_time_inc(&stats->read_holdtime, holdtime);
+	else
+		lock_time_inc(&stats->write_holdtime, holdtime);
+	put_lock_stats(stats);
+}
+#else
+static inline void lock_release_holdtime(struct held_lock *hlock)
+{
+}
+#endif
+
 /*
  * We keep a global list of all lock classes. The list only grows,
  * never shrinks. The list is only accessed with the lockdep
@@ -2221,6 +2299,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	int chain_head = 0;
 	u64 chain_key;
 
+	if (!prove_locking)
+		check = 1;
+
 	if (unlikely(!debug_locks))
 		return 0;
 
@@ -2271,6 +2352,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->read = read;
 	hlock->check = check;
 	hlock->hardirqs_off = hardirqs_off;
+#ifdef CONFIG_LOCK_STAT
+	hlock->waittime_stamp = 0;
+	hlock->holdtime_stamp = sched_clock();
+#endif
 
 	if (check == 2 && !mark_irqflags(curr, hlock))
 		return 0;
@@ -2411,6 +2496,8 @@ lock_release_non_nested(struct task_struct *curr,
 	return print_unlock_inbalance_bug(curr, lock, ip);
 
 found_it:
+	lock_release_holdtime(hlock);
+
 	/*
 	 * We have the right lock to unlock, 'hlock' points to it.
 	 * Now we remove it from the stack, and add back the other
@@ -2463,6 +2550,8 @@ static int lock_release_nested(struct task_struct *curr,
 
 	curr->curr_chain_key = hlock->prev_chain_key;
 
+	lock_release_holdtime(hlock);
+
 #ifdef CONFIG_DEBUG_LOCKDEP
 	hlock->prev_chain_key = 0;
 	hlock->class = NULL;
@@ -2537,6 +2626,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 {
 	unsigned long flags;
 
+	if (unlikely(!lock_stat && !prove_locking))
+		return;
+
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -2556,6 +2648,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 {
 	unsigned long flags;
 
+	if (unlikely(!lock_stat && !prove_locking))
+		return;
+
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -2569,6 +2664,158 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 
 EXPORT_SYMBOL_GPL(lock_release);
 
+#ifdef CONFIG_LOCK_STAT
+static int
+print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
+			   unsigned long ip)
+{
+	if (!debug_locks_off())
+		return 0;
+	if (debug_locks_silent)
+		return 0;
+
+	printk("\n=================================\n");
+	printk(  "[ BUG: bad contention detected! ]\n");
+	printk(  "---------------------------------\n");
+	printk("%s/%d is trying to contend lock (",
+		curr->comm, curr->pid);
+	print_lockdep_cache(lock);
+	printk(") at:\n");
+	print_ip_sym(ip);
+	printk("but there are no locks held!\n");
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+
+static void
+__lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class_stats *stats;
+	unsigned int depth;
+	int i, point;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (hlock->instance == lock)
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	print_lock_contention_bug(curr, lock, ip);
+	return;
+
+found_it:
+	hlock->waittime_stamp = sched_clock();
+
+	point = lock_contention_point(hlock->class, ip);
+
+	stats = get_lock_stats(hlock->class);
+	if (point < ARRAY_SIZE(stats->contention_point))
+		stats->contention_point[i]++;
+	put_lock_stats(stats);
+}
+
+static void
+__lock_acquired(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class_stats *stats;
+	unsigned int depth;
+	u64 now;
+	s64 waittime;
+	int i;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (hlock->instance == lock)
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	print_lock_contention_bug(curr, lock, _RET_IP_);
+	return;
+
+found_it:
+	if (!hlock->waittime_stamp)
+		return;
+
+	now = sched_clock();
+	waittime = now - hlock->waittime_stamp;
+	hlock->holdtime_stamp = now;
+
+	stats = get_lock_stats(hlock->class);
+	if (hlock->read)
+		lock_time_inc(&stats->read_waittime, waittime);
+	else
+		lock_time_inc(&stats->write_waittime, waittime);
+	put_lock_stats(stats);
+}
+
+void lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(!lock_stat))
+		return;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	__lock_contended(lock, ip);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_contended);
+
+void lock_acquired(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(!lock_stat))
+		return;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	__lock_acquired(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquired);
+#endif
+
 /*
  * Used by the testsuite, sanitize the validator state
  * after a simulated failure:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2aaa3f98185d..e69179b1809c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -161,6 +161,8 @@ extern ctl_table inotify_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+extern int prove_locking;
+extern int lock_stat;
 
 /* The default sysctl tables: */
 
@@ -282,6 +284,26 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_PROVE_LOCKING
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "prove_locking",
+		.data		= &prove_locking,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_LOCK_STAT
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "lock_stat",
+		.data		= &lock_stat,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_features",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 640844024ffd..f3e0c2abcbd0 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -283,6 +283,17 @@ config LOCKDEP
 	select KALLSYMS
 	select KALLSYMS_ALL
 
+config LOCK_STAT
+	bool "Lock usage statisitics"
+	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
+	select LOCKDEP
+	select DEBUG_SPINLOCK
+	select DEBUG_MUTEXES
+	select DEBUG_LOCK_ALLOC
+	default n
+	help
+	 This feature enables tracking lock contention points
+
 config DEBUG_LOCKDEP
 	bool "Lock dependency engine debugging"
 	depends on DEBUG_KERNEL && LOCKDEP
-- 
cgit v1.3-8-gc7d7


From 4b32d0a4e9ec07808a5c406a416c6576c986b047 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jul 2007 01:48:59 -0700
Subject: lockdep: various fixes

 - update the copyright notices
 - use the default hash function
 - fix a thinko in a BUILD_BUG_ON
 - add a WARN_ON to spot inconsitent naming
 - fix a termination issue in /proc/lock_stat

[akpm@linux-foundation.org: cleanups]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lockdep.h |  3 ++-
 kernel/lockdep.c        | 21 ++++++++++++---------
 kernel/lockdep_proc.c   |  6 +++++-
 3 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 8f946f614f8e..3d3386b88b6a 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -1,7 +1,8 @@
 /*
  * Runtime locking correctness validator
  *
- *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * see Documentation/lockdep-design.txt for more details.
  */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index a8dc99d9fef7..cb64022851c8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -5,7 +5,8 @@
  *
  * Started by Ingo Molnar:
  *
- *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * this code maps all the lock dependencies as they occur in a live kernel
  * and will warn about the following classes of locking bugs:
@@ -37,6 +38,7 @@
 #include <linux/debug_locks.h>
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
+#include <linux/hash.h>
 
 #include <asm/sections.h>
 
@@ -238,8 +240,7 @@ LIST_HEAD(all_lock_classes);
  */
 #define CLASSHASH_BITS		(MAX_LOCKDEP_KEYS_BITS - 1)
 #define CLASSHASH_SIZE		(1UL << CLASSHASH_BITS)
-#define CLASSHASH_MASK		(CLASSHASH_SIZE - 1)
-#define __classhashfn(key)	((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
+#define __classhashfn(key)	hash_long((unsigned long)key, CLASSHASH_BITS)
 #define classhashentry(key)	(classhash_table + __classhashfn((key)))
 
 static struct list_head classhash_table[CLASSHASH_SIZE];
@@ -250,9 +251,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
  */
 #define CHAINHASH_BITS		(MAX_LOCKDEP_CHAINS_BITS-1)
 #define CHAINHASH_SIZE		(1UL << CHAINHASH_BITS)
-#define CHAINHASH_MASK		(CHAINHASH_SIZE - 1)
-#define __chainhashfn(chain) \
-		(((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
+#define __chainhashfn(chain)	hash_long(chain, CHAINHASH_BITS)
 #define chainhashentry(chain)	(chainhash_table + __chainhashfn((chain)))
 
 static struct list_head chainhash_table[CHAINHASH_SIZE];
@@ -676,7 +675,8 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	 * (or spin_lock_init()) call - which acts as the key. For static
 	 * locks we use the lock object itself as the key.
 	 */
-	BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class));
+	BUILD_BUG_ON(sizeof(struct lock_class_key) >
+			sizeof(struct lockdep_map));
 
 	key = lock->key->subkeys + subclass;
 
@@ -686,9 +686,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	 * We can walk the hash lockfree, because the hash only
 	 * grows, and we are careful when adding entries to the end:
 	 */
-	list_for_each_entry(class, hash_head, hash_entry)
-		if (class->key == key)
+	list_for_each_entry(class, hash_head, hash_entry) {
+		if (class->key == key) {
+			WARN_ON_ONCE(class->name != lock->name);
 			return class;
+		}
+	}
 
 	return NULL;
 }
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index e682926c9ad6..39163ed1bf0a 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -5,7 +5,8 @@
  *
  * Started by Ingo Molnar:
  *
- *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Code for /proc/lockdep and /proc/lockdep_stats:
  *
@@ -498,6 +499,9 @@ static void *ls_start(struct seq_file *m, loff_t *pos)
 	if (data->iter == data->stats)
 		seq_header(m);
 
+	if (data->iter == data->iter_end)
+		data->iter = NULL;
+
 	return data->iter;
 }
 
-- 
cgit v1.3-8-gc7d7


From 96645678cd726e87ce42a0664de71e047e32bca4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jul 2007 01:49:00 -0700
Subject: lockstat: measure lock bouncing

    __acquire
        |
       lock _____
        |        \
        |    __contended
        |         |
        |        wait
        | _______/
        |/
        |
   __acquired
        |
   __release
        |
     unlock

We measure acquisition and contention bouncing.

This is done by recording a cpu stamp in each lock instance.

Contention bouncing requires the cpu stamp to be set on acquisition. Hence we
move __acquired into the generic path.

__acquired is then used to measure acquisition bouncing by comparing the
current cpu with the old stamp before replacing it.

__contended is used to measure contention bouncing (only useful for preemptable
locks)

[akpm@linux-foundation.org: cleanups]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lockdep.h | 17 ++++++++++++++++-
 kernel/lockdep.c        | 38 ++++++++++++++++++++++++++------------
 kernel/lockdep_proc.c   | 19 ++++++++++++-------
 kernel/mutex.c          |  2 +-
 4 files changed, 55 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 3d3386b88b6a..0e843bf65877 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -130,12 +130,24 @@ struct lock_time {
 	unsigned long			nr;
 };
 
+enum bounce_type {
+	bounce_acquired_write,
+	bounce_acquired_read,
+	bounce_contended_write,
+	bounce_contended_read,
+	nr_bounce_types,
+
+	bounce_acquired = bounce_acquired_write,
+	bounce_contended = bounce_contended_write,
+};
+
 struct lock_class_stats {
 	unsigned long			contention_point[4];
 	struct lock_time		read_waittime;
 	struct lock_time		write_waittime;
 	struct lock_time		read_holdtime;
 	struct lock_time		write_holdtime;
+	unsigned long			bounces[nr_bounce_types];
 };
 
 struct lock_class_stats lock_stats(struct lock_class *class);
@@ -150,6 +162,9 @@ struct lockdep_map {
 	struct lock_class_key		*key;
 	struct lock_class		*class_cache;
 	const char			*name;
+#ifdef CONFIG_LOCK_STAT
+	int				cpu;
+#endif
 };
 
 /*
@@ -321,8 +336,8 @@ do {								\
 	if (!try(_lock)) {					\
 		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
 		lock(_lock);					\
-		lock_acquired(&(_lock)->dep_map);		\
 	}							\
+	lock_acquired(&(_lock)->dep_map);			\
 } while (0)
 
 #else /* CONFIG_LOCK_STAT */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index cb64022851c8..156fce4960c3 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -177,6 +177,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
 
 		lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
 		lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+
+		for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
+			stats.bounces[i] += pcs->bounces[i];
 	}
 
 	return stats;
@@ -2325,6 +2328,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	lock->name = name;
 	lock->key = key;
 	lock->class_cache = NULL;
+#ifdef CONFIG_LOCK_STAT
+	lock->cpu = raw_smp_processor_id();
+#endif
 	if (subclass)
 		register_lock_class(lock, subclass, 1);
 }
@@ -2775,6 +2781,8 @@ found_it:
 	stats = get_lock_stats(hlock->class);
 	if (point < ARRAY_SIZE(stats->contention_point))
 		stats->contention_point[i]++;
+	if (lock->cpu != smp_processor_id())
+		stats->bounces[bounce_contended + !!hlock->read]++;
 	put_lock_stats(stats);
 }
 
@@ -2786,8 +2794,8 @@ __lock_acquired(struct lockdep_map *lock)
 	struct lock_class_stats *stats;
 	unsigned int depth;
 	u64 now;
-	s64 waittime;
-	int i;
+	s64 waittime = 0;
+	int i, cpu;
 
 	depth = curr->lockdep_depth;
 	if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -2809,19 +2817,25 @@ __lock_acquired(struct lockdep_map *lock)
 	return;
 
 found_it:
-	if (!hlock->waittime_stamp)
-		return;
-
-	now = sched_clock();
-	waittime = now - hlock->waittime_stamp;
-	hlock->holdtime_stamp = now;
+	cpu = smp_processor_id();
+	if (hlock->waittime_stamp) {
+		now = sched_clock();
+		waittime = now - hlock->waittime_stamp;
+		hlock->holdtime_stamp = now;
+	}
 
 	stats = get_lock_stats(hlock->class);
-	if (hlock->read)
-		lock_time_inc(&stats->read_waittime, waittime);
-	else
-		lock_time_inc(&stats->write_waittime, waittime);
+	if (waittime) {
+		if (hlock->read)
+			lock_time_inc(&stats->read_waittime, waittime);
+		else
+			lock_time_inc(&stats->write_waittime, waittime);
+	}
+	if (lock->cpu != cpu)
+		stats->bounces[bounce_acquired + !!hlock->read]++;
 	put_lock_stats(stats);
+
+	lock->cpu = cpu;
 }
 
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 39163ed1bf0a..7ff80135cbeb 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -430,16 +430,18 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 		else
 			seq_printf(m, "%40s:", name);
 
+		seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
 		seq_lock_time(m, &stats->write_waittime);
-		seq_puts(m, " ");
+		seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
 		seq_lock_time(m, &stats->write_holdtime);
 		seq_puts(m, "\n");
 	}
 
 	if (stats->read_holdtime.nr) {
 		seq_printf(m, "%38s-R:", name);
+		seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
 		seq_lock_time(m, &stats->read_waittime);
-		seq_puts(m, " ");
+		seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
 		seq_lock_time(m, &stats->read_holdtime);
 		seq_puts(m, "\n");
 	}
@@ -469,26 +471,29 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 	}
 	if (i) {
 		seq_puts(m, "\n");
-		seq_line(m, '.', 0, 40 + 1 + 8 * (14 + 1));
+		seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
 		seq_puts(m, "\n");
 	}
 }
 
 static void seq_header(struct seq_file *m)
 {
-	seq_printf(m, "lock_stat version 0.1\n");
-	seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1));
-	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s\n",
+	seq_printf(m, "lock_stat version 0.2\n");
+	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
+	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
+			"%14s %14s\n",
 			"class name",
+			"con-bounces",
 			"contentions",
 			"waittime-min",
 			"waittime-max",
 			"waittime-total",
+			"acq-bounces",
 			"acquisitions",
 			"holdtime-min",
 			"holdtime-max",
 			"holdtime-total");
-	seq_line(m, '-', 0, 40 + 1 + 8 * (14 + 1));
+	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
 	seq_printf(m, "\n");
 }
 
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 7a3f32761f26..691b86564dd9 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -180,8 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
-	lock_acquired(&lock->dep_map);
 done:
+	lock_acquired(&lock->dep_map);
 	/* got the lock - rejoice! */
 	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
 	debug_mutex_set_owner(lock, task_thread_info(task));
-- 
cgit v1.3-8-gc7d7


From 3b5ad0797c0e4049001f961a8b58f1d0ce532072 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 19 Jul 2007 01:49:02 -0700
Subject: stacktrace: fix header file for !CONFIG_STACKTRACE

The print_stack_trace macro in stacktrace.h has a wrong number of
arguments, fix it.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stacktrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 1d2b084c0185..e7fa657d0c49 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -13,7 +13,7 @@ extern void save_stack_trace(struct stack_trace *trace);
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
 #else
 # define save_stack_trace(trace)			do { } while (0)
-# define print_stack_trace(trace)			do { } while (0)
+# define print_stack_trace(trace, spaces)		do { } while (0)
 #endif
 
 #endif
-- 
cgit v1.3-8-gc7d7


From d688abf50bd5a30d2c44dea2a72dd59052cd3cce Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 19 Jul 2007 01:49:17 -0700
Subject: move page writeback acounting out of macros

page-writeback accounting is presently performed in the page-flags macros.
This is inconsistent and a bit ugly and makes it awkward to implement
per-backing_dev under-writeback page accounting.

So move this accounting down to the callsite(s).

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 38 ++++++++------------------------------
 mm/page-writeback.c        |  4 ++++
 2 files changed, 12 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a454176c3e30..209d3a47f50f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -189,37 +189,15 @@ static inline void SetPageUptodate(struct page *page)
 #define __SetPagePrivate(page)  __set_bit(PG_private, &(page)->flags)
 #define __ClearPagePrivate(page) __clear_bit(PG_private, &(page)->flags)
 
+/*
+ * Only test-and-set exist for PG_writeback.  The unconditional operators are
+ * risky: they bypass page accounting.
+ */
 #define PageWriteback(page)	test_bit(PG_writeback, &(page)->flags)
-#define SetPageWriteback(page)						\
-	do {								\
-		if (!test_and_set_bit(PG_writeback,			\
-				&(page)->flags))			\
-			inc_zone_page_state(page, NR_WRITEBACK);	\
-	} while (0)
-#define TestSetPageWriteback(page)					\
-	({								\
-		int ret;						\
-		ret = test_and_set_bit(PG_writeback,			\
-					&(page)->flags);		\
-		if (!ret)						\
-			inc_zone_page_state(page, NR_WRITEBACK);	\
-		ret;							\
-	})
-#define ClearPageWriteback(page)					\
-	do {								\
-		if (test_and_clear_bit(PG_writeback,			\
-				&(page)->flags))			\
-			dec_zone_page_state(page, NR_WRITEBACK);	\
-	} while (0)
-#define TestClearPageWriteback(page)					\
-	({								\
-		int ret;						\
-		ret = test_and_clear_bit(PG_writeback,			\
-				&(page)->flags);			\
-		if (ret)						\
-			dec_zone_page_state(page, NR_WRITEBACK);	\
-		ret;							\
-	})
+#define TestSetPageWriteback(page) test_and_set_bit(PG_writeback,	\
+							&(page)->flags)
+#define TestClearPageWriteback(page) test_and_clear_bit(PG_writeback,	\
+							&(page)->flags)
 
 #define PageBuddy(page)		test_bit(PG_buddy, &(page)->flags)
 #define __SetPageBuddy(page)	__set_bit(PG_buddy, &(page)->flags)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 51b3eb6ab445..63512a9ed57e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -987,6 +987,8 @@ int test_clear_page_writeback(struct page *page)
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
+	if (ret)
+		dec_zone_page_state(page, NR_WRITEBACK);
 	return ret;
 }
 
@@ -1012,6 +1014,8 @@ int test_set_page_writeback(struct page *page)
 	} else {
 		ret = TestSetPageWriteback(page);
 	}
+	if (!ret)
+		inc_zone_page_state(page, NR_WRITEBACK);
 	return ret;
 
 }
-- 
cgit v1.3-8-gc7d7


From e22841c637dc8b308b40f59d64a5b6683d458ab7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 19 Jul 2007 01:49:20 -0700
Subject: knfsd: move EX_RDONLY out of header

EX_RDONLY is only called in one place; just put it there.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/vfs.c               | 12 ++++++++++++
 include/linux/nfsd/export.h | 12 ------------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5c97d0ea9e22..f2684e57cf22 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1797,6 +1797,18 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 	return err;
 }
 
+static inline int EX_RDONLY(struct svc_export *exp, struct svc_rqst *rqstp)
+{
+	struct exp_flavor_info *f;
+	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+
+	for (f = exp->ex_flavors; f < end; f++) {
+		if (f->pseudoflavor == rqstp->rq_flavor)
+			return f->flags & NFSEXP_READONLY;
+	}
+	return exp->ex_flags & NFSEXP_READONLY;
+}
+
 /*
  * Check for a user's access permissions to this inode.
  */
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 78feb7beff75..fb4e93016666 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -116,18 +116,6 @@ struct svc_expkey {
 #define EX_NOHIDE(exp)		((exp)->ex_flags & NFSEXP_NOHIDE)
 #define EX_WGATHER(exp)		((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
 
-static inline int EX_RDONLY(struct svc_export *exp, struct svc_rqst *rqstp)
-{
-	struct exp_flavor_info *f;
-	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
-
-	for (f = exp->ex_flavors; f < end; f++) {
-		if (f->pseudoflavor == rqstp->rq_flavor)
-			return f->flags & NFSEXP_READONLY;
-	}
-	return exp->ex_flags & NFSEXP_READONLY;
-}
-
 __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
 
 /*
-- 
cgit v1.3-8-gc7d7


From c7d51402d2a64c5b96531f9900bb368020ebbbbb Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 19 Jul 2007 01:49:20 -0700
Subject: knfsd: clean up EX_RDONLY

Share a little common code, reverse the arguments for consistency, drop the
unnecessary "inline", and lowercase the name.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/auth.c              |  3 ++-
 fs/nfsd/vfs.c               | 13 +++----------
 include/linux/nfsd/export.h |  1 +
 3 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index cf61dc8ae942..21928056e35e 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -9,10 +9,11 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcauth.h>
 #include <linux/nfsd/nfsd.h>
+#include <linux/nfsd/export.h>
 
 #define	CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE))
 
-static int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
 {
 	struct exp_flavor_info *f;
 	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f2684e57cf22..ee96a897a29e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1797,16 +1797,9 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 	return err;
 }
 
-static inline int EX_RDONLY(struct svc_export *exp, struct svc_rqst *rqstp)
+static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp)
 {
-	struct exp_flavor_info *f;
-	struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
-
-	for (f = exp->ex_flavors; f < end; f++) {
-		if (f->pseudoflavor == rqstp->rq_flavor)
-			return f->flags & NFSEXP_READONLY;
-	}
-	return exp->ex_flags & NFSEXP_READONLY;
+	return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY;
 }
 
 /*
@@ -1845,7 +1838,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	 */
 	if (!(acc & MAY_LOCAL_ACCESS))
 		if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
-			if (EX_RDONLY(exp, rqstp) || IS_RDONLY(inode))
+			if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode))
 				return nfserr_rofs;
 			if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
 				return nfserr_perm;
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index fb4e93016666..5cd192469096 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -116,6 +116,7 @@ struct svc_expkey {
 #define EX_NOHIDE(exp)		((exp)->ex_flags & NFSEXP_NOHIDE)
 #define EX_WGATHER(exp)		((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
 
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp);
 __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
 
 /*
-- 
cgit v1.3-8-gc7d7


From 07ad157f6e5d228be78acd5cea0291e5d0360398 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 19 Jul 2007 01:49:22 -0700
Subject: lguest: the guest code

lguest is a simple hypervisor for Linux on Linux.  Unlike kvm it doesn't need
VT/SVM hardware.  Unlike Xen it's simply "modprobe and go".  Unlike both, it's
5000 lines and self-contained.

Performance is ok, but not great (-30% on kernel compile).  But given its
hackability, I expect this to improve, along with the paravirt_ops code which
it supplies a complete example for.  There's also a 64-bit version being
worked on and other craziness.

But most of all, lguest is awesome fun!  Too much of the kernel is a big ball
of hair.  lguest is simple enough to dive into and hack, plus has some warts
which scream "fork me!".

This patch:

This is the code and headers required to make an i386 kernel an lguest guest.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/lguest/lguest.c     | 544 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/lguest/lguest_asm.S |  53 +++++
 drivers/lguest/lguest_bus.c | 148 ++++++++++++
 include/linux/lguest.h      |  85 +++++++
 include/linux/lguest_bus.h  |  48 ++++
 5 files changed, 878 insertions(+)
 create mode 100644 drivers/lguest/lguest.c
 create mode 100644 drivers/lguest/lguest_asm.S
 create mode 100644 drivers/lguest/lguest_bus.c
 create mode 100644 include/linux/lguest.h
 create mode 100644 include/linux/lguest_bus.h

(limited to 'include/linux')

diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
new file mode 100644
index 000000000000..b3a72bd8d6f5
--- /dev/null
+++ b/drivers/lguest/lguest.c
@@ -0,0 +1,544 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/lguest.h>
+#include <linux/lguest_launcher.h>
+#include <linux/lguest_bus.h>
+#include <asm/paravirt.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/mce.h>
+#include <asm/io.h>
+
+/* Declarations for definitions in lguest_guest.S */
+extern char lguest_noirq_start[], lguest_noirq_end[];
+extern const char lgstart_cli[], lgend_cli[];
+extern const char lgstart_sti[], lgend_sti[];
+extern const char lgstart_popf[], lgend_popf[];
+extern const char lgstart_pushf[], lgend_pushf[];
+extern const char lgstart_iret[], lgend_iret[];
+extern void lguest_iret(void);
+
+struct lguest_data lguest_data = {
+	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
+	.noirq_start = (u32)lguest_noirq_start,
+	.noirq_end = (u32)lguest_noirq_end,
+	.blocked_interrupts = { 1 }, /* Block timer interrupts */
+};
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot = __va(0);
+
+static enum paravirt_lazy_mode lazy_mode;
+static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
+{
+	if (mode == PARAVIRT_LAZY_FLUSH) {
+		if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE))
+			hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	} else {
+		lazy_mode = mode;
+		if (mode == PARAVIRT_LAZY_NONE)
+			hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	}
+}
+
+static void lazy_hcall(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2,
+		       unsigned long arg3)
+{
+	if (lazy_mode == PARAVIRT_LAZY_NONE)
+		hcall(call, arg1, arg2, arg3);
+	else
+		async_hcall(call, arg1, arg2, arg3);
+}
+
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	/* Note: This code assumes we're uniprocessor. */
+	static unsigned int next_call;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (lguest_data.hcall_status[next_call] != 0xFF) {
+		/* Table full, so do normal hcall which will flush table. */
+		hcall(call, arg1, arg2, arg3);
+	} else {
+		lguest_data.hcalls[next_call].eax = call;
+		lguest_data.hcalls[next_call].edx = arg1;
+		lguest_data.hcalls[next_call].ebx = arg2;
+		lguest_data.hcalls[next_call].ecx = arg3;
+		/* Make sure host sees arguments before "valid" flag. */
+		wmb();
+		lguest_data.hcall_status[next_call] = 0;
+		if (++next_call == LHCALL_RING_SIZE)
+			next_call = 0;
+	}
+	local_irq_restore(flags);
+}
+
+void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
+{
+	dma->used_len = 0;
+	hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
+}
+
+int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
+		    unsigned int num, u8 irq)
+{
+	if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
+		return -ENOMEM;
+	return 0;
+}
+
+void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
+{
+	hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
+}
+
+/* For guests, device memory can be used as normal memory, so we cast away the
+ * __iomem to quieten sparse. */
+void *lguest_map(unsigned long phys_addr, unsigned long pages)
+{
+	return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
+}
+
+void lguest_unmap(void *addr)
+{
+	iounmap((__force void __iomem *)addr);
+}
+
+static unsigned long save_fl(void)
+{
+	return lguest_data.irq_enabled;
+}
+
+static void restore_fl(unsigned long flags)
+{
+	/* FIXME: Check if interrupt pending... */
+	lguest_data.irq_enabled = flags;
+}
+
+static void irq_disable(void)
+{
+	lguest_data.irq_enabled = 0;
+}
+
+static void irq_enable(void)
+{
+	/* FIXME: Check if interrupt pending... */
+	lguest_data.irq_enabled = X86_EFLAGS_IF;
+}
+
+static void lguest_write_idt_entry(struct desc_struct *dt,
+				   int entrynum, u32 low, u32 high)
+{
+	write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+static void lguest_load_idt(const struct Xgt_desc_struct *desc)
+{
+	unsigned int i;
+	struct desc_struct *idt = (void *)desc->address;
+
+	for (i = 0; i < (desc->size+1)/8; i++)
+		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+}
+
+static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
+{
+	BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
+	hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+}
+
+static void lguest_write_gdt_entry(struct desc_struct *dt,
+				   int entrynum, u32 low, u32 high)
+{
+	write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+}
+
+static void lguest_set_ldt(const void *addr, unsigned entries)
+{
+}
+
+static void lguest_load_tr_desc(void)
+{
+}
+
+static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+			 unsigned int *ecx, unsigned int *edx)
+{
+	int function = *eax;
+
+	native_cpuid(eax, ebx, ecx, edx);
+	switch (function) {
+	case 1:	/* Basic feature request. */
+		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
+		*ecx &= 0x00002201;
+		/* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
+		*edx &= 0x07808101;
+		/* Host wants to know when we flush kernel pages: set PGE. */
+		*edx |= 0x00002000;
+		break;
+	case 0x80000000:
+		/* Futureproof this a little: if they ask how much extended
+		 * processor information, limit it to known fields. */
+		if (*eax > 0x80000008)
+			*eax = 0x80000008;
+		break;
+	}
+}
+
+static unsigned long current_cr0, current_cr3;
+static void lguest_write_cr0(unsigned long val)
+{
+	lazy_hcall(LHCALL_TS, val & 8, 0, 0);
+	current_cr0 = val;
+}
+
+static unsigned long lguest_read_cr0(void)
+{
+	return current_cr0;
+}
+
+static void lguest_clts(void)
+{
+	lazy_hcall(LHCALL_TS, 0, 0, 0);
+	current_cr0 &= ~8U;
+}
+
+static unsigned long lguest_read_cr2(void)
+{
+	return lguest_data.cr2;
+}
+
+static void lguest_write_cr3(unsigned long cr3)
+{
+	lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+	current_cr3 = cr3;
+}
+
+static unsigned long lguest_read_cr3(void)
+{
+	return current_cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care. */
+static unsigned long lguest_read_cr4(void)
+{
+	return 0;
+}
+
+static void lguest_write_cr4(unsigned long val)
+{
+}
+
+static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+}
+
+/* We only support two-level pagetables at the moment. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	*pmdp = pmdval;
+	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
+		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+}
+
+/* FIXME: Eliminate all callers of this. */
+static void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+	/* Don't bother with hypercall before initial setup. */
+	if (current_cr3)
+		lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void lguest_flush_tlb_single(unsigned long addr)
+{
+	/* Simply set it to zero, and it will fault back in. */
+	lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
+}
+
+static void lguest_flush_tlb_user(void)
+{
+	lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+}
+
+static void lguest_flush_tlb_kernel(void)
+{
+	lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void disable_lguest_irq(unsigned int irq)
+{
+	set_bit(irq, lguest_data.blocked_interrupts);
+}
+
+static void enable_lguest_irq(unsigned int irq)
+{
+	clear_bit(irq, lguest_data.blocked_interrupts);
+	/* FIXME: If it's pending? */
+}
+
+static struct irq_chip lguest_irq_controller = {
+	.name		= "lguest",
+	.mask		= disable_lguest_irq,
+	.mask_ack	= disable_lguest_irq,
+	.unmask		= enable_lguest_irq,
+};
+
+static void __init lguest_init_IRQ(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_IRQS; i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (vector != SYSCALL_VECTOR) {
+			set_intr_gate(vector, interrupt[i]);
+			set_irq_chip_and_handler(i, &lguest_irq_controller,
+						 handle_level_irq);
+		}
+	}
+	irq_ctx_init(smp_processor_id());
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+	return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+	do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
+	update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static u64 sched_clock_base;
+static void lguest_time_init(void)
+{
+	set_irq_handler(0, lguest_time_irq);
+	hcall(LHCALL_TIMER_READ, 0, 0, 0);
+	sched_clock_base = jiffies_64;
+	enable_lguest_irq(0);
+}
+
+static unsigned long long lguest_sched_clock(void)
+{
+	return (jiffies_64 - sched_clock_base) * (1000000000 / HZ);
+}
+
+static void lguest_load_esp0(struct tss_struct *tss,
+				     struct thread_struct *thread)
+{
+	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
+		   THREAD_SIZE/PAGE_SIZE);
+}
+
+static void lguest_set_debugreg(int regno, unsigned long value)
+{
+	/* FIXME: Implement */
+}
+
+static void lguest_wbinvd(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void lguest_apic_write(unsigned long reg, unsigned long v)
+{
+}
+
+static unsigned long lguest_apic_read(unsigned long reg)
+{
+	return 0;
+}
+#endif
+
+static void lguest_safe_halt(void)
+{
+	hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+	hcall(LHCALL_CRASH, __pa(p), 0, 0);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+	.notifier_call = lguest_panic
+};
+
+static __init char *lguest_memory_setup(void)
+{
+	/* We do this here because lockcheck barfs if before start_kernel */
+	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+	e820.nr_map = 0;
+	add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+	return "LGUEST";
+}
+
+static const struct lguest_insns
+{
+	const char *start, *end;
+} lguest_insns[] = {
+	[PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli },
+	[PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti },
+	[PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
+	[PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
+};
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement */
+	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+		return paravirt_patch_default(type, clobber, insns, len);
+
+	insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+	/* Similarly if we can't fit replacement. */
+	if (len < insn_len)
+		return paravirt_patch_default(type, clobber, insns, len);
+
+	memcpy(insns, lguest_insns[type].start, insn_len);
+	return insn_len;
+}
+
+__init void lguest_init(void)
+{
+	paravirt_ops.name = "lguest";
+	paravirt_ops.paravirt_enabled = 1;
+	paravirt_ops.kernel_rpl = 1;
+
+	paravirt_ops.save_fl = save_fl;
+	paravirt_ops.restore_fl = restore_fl;
+	paravirt_ops.irq_disable = irq_disable;
+	paravirt_ops.irq_enable = irq_enable;
+	paravirt_ops.load_gdt = lguest_load_gdt;
+	paravirt_ops.memory_setup = lguest_memory_setup;
+	paravirt_ops.cpuid = lguest_cpuid;
+	paravirt_ops.write_cr3 = lguest_write_cr3;
+	paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;
+	paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+	paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+	paravirt_ops.set_pte = lguest_set_pte;
+	paravirt_ops.set_pte_at = lguest_set_pte_at;
+	paravirt_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_LOCAL_APIC
+	paravirt_ops.apic_write = lguest_apic_write;
+	paravirt_ops.apic_write_atomic = lguest_apic_write;
+	paravirt_ops.apic_read = lguest_apic_read;
+#endif
+	paravirt_ops.load_idt = lguest_load_idt;
+	paravirt_ops.iret = lguest_iret;
+	paravirt_ops.load_esp0 = lguest_load_esp0;
+	paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+	paravirt_ops.set_ldt = lguest_set_ldt;
+	paravirt_ops.load_tls = lguest_load_tls;
+	paravirt_ops.set_debugreg = lguest_set_debugreg;
+	paravirt_ops.clts = lguest_clts;
+	paravirt_ops.read_cr0 = lguest_read_cr0;
+	paravirt_ops.write_cr0 = lguest_write_cr0;
+	paravirt_ops.init_IRQ = lguest_init_IRQ;
+	paravirt_ops.read_cr2 = lguest_read_cr2;
+	paravirt_ops.read_cr3 = lguest_read_cr3;
+	paravirt_ops.read_cr4 = lguest_read_cr4;
+	paravirt_ops.write_cr4 = lguest_write_cr4;
+	paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+	paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+	paravirt_ops.patch = lguest_patch;
+	paravirt_ops.safe_halt = lguest_safe_halt;
+	paravirt_ops.get_wallclock = lguest_get_wallclock;
+	paravirt_ops.time_init = lguest_time_init;
+	paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+	paravirt_ops.wbinvd = lguest_wbinvd;
+	paravirt_ops.sched_clock = lguest_sched_clock;
+
+	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+	strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+
+	/* We use top of mem for initial pagetables. */
+	init_pg_tables_end = __pa(pg0);
+
+	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
+
+	reserve_top_address(lguest_data.reserve_mem);
+
+	lockdep_init();
+
+	paravirt_disable_iospace();
+
+	cpu_detect(&new_cpu_data);
+	/* head.S usually sets up the first capability word, so do it here. */
+	new_cpu_data.x86_capability[0] = cpuid_edx(1);
+
+	/* Math is always hard! */
+	new_cpu_data.hard_math = 1;
+
+#ifdef CONFIG_X86_MCE
+	mce_disabled = 1;
+#endif
+
+#ifdef CONFIG_ACPI
+	acpi_disabled = 1;
+	acpi_ht = 0;
+#endif
+
+	add_preferred_console("hvc", 0, NULL);
+
+	if (boot->initrd_size) {
+		/* We stash this at top of memory. */
+		INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+		INITRD_SIZE = boot->initrd_size;
+		LOADER_TYPE = 0xFF;
+	}
+
+	pm_power_off = lguest_power_off;
+	start_kernel();
+}
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S
new file mode 100644
index 000000000000..5ac3d20bb184
--- /dev/null
+++ b/drivers/lguest/lguest_asm.S
@@ -0,0 +1,53 @@
+#include <linux/linkage.h>
+#include <linux/lguest.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+/* FIXME: Once asm/processor-flags.h goes in, include that */
+#define X86_EFLAGS_IF 0x00000200
+
+/*
+ * This is where we begin: we have a magic signature which the launcher looks
+ * for.  The plan is that the Linux boot protocol will be extended with a
+ * "platform type" field which will guide us here from the normal entry point,
+ * but for the moment this suffices.
+ *
+ * We put it in .init.text will be discarded after boot.
+ */
+.section .init.text, "ax", @progbits
+.ascii "GenuineLguest"
+	/* Set up initial stack. */
+ 	movl $(init_thread_union+THREAD_SIZE),%esp
+	jmp lguest_init
+
+/* The templates for inline patching. */
+#define LGUEST_PATCH(name, insns...)			\
+	lgstart_##name:	insns; lgend_##name:;		\
+	.globl lgstart_##name; .globl lgend_##name
+
+LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
+LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
+LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
+LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
+
+.text
+/* These demark the EIP range where host should never deliver interrupts. */
+.global lguest_noirq_start
+.global lguest_noirq_end
+
+/*
+ * We move eflags word to lguest_data.irq_enabled to restore interrupt state.
+ * For page faults, gpfs and virtual interrupts, the hypervisor has saved
+ * eflags manually, otherwise it was delivered directly and so eflags reflects
+ * the real machine IF state, ie. interrupts on.  Since the kernel always dies
+ * if it takes such a trap with interrupts disabled anyway, turning interrupts
+ * back on unconditionally here is OK.
+ */
+ENTRY(lguest_iret)
+	pushl	%eax
+	movl	12(%esp), %eax
+lguest_noirq_start:
+	movl	%eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
+	popl	%eax
+	iret
+lguest_noirq_end:
diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c
new file mode 100644
index 000000000000..18d6ab21a43b
--- /dev/null
+++ b/drivers/lguest/lguest_bus.c
@@ -0,0 +1,148 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/lguest_bus.h>
+#include <asm/io.h>
+
+static ssize_t type_show(struct device *_dev,
+                         struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hu", lguest_devices[dev->index].type);
+}
+static ssize_t features_show(struct device *_dev,
+                             struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].features);
+}
+static ssize_t pfn_show(struct device *_dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
+}
+static ssize_t status_show(struct device *_dev,
+                           struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].status);
+}
+static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
+		return -EINVAL;
+	return count;
+}
+static struct device_attribute lguest_dev_attrs[] = {
+	__ATTR_RO(type),
+	__ATTR_RO(features),
+	__ATTR_RO(pfn),
+	__ATTR(status, 0644, status_show, status_store),
+	__ATTR_NULL
+};
+
+static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);
+
+	return (drv->device_type == lguest_devices[dev->index].type);
+}
+
+struct lguest_bus {
+	struct bus_type bus;
+	struct device dev;
+};
+
+static struct lguest_bus lguest_bus = {
+	.bus = {
+		.name  = "lguest",
+		.match = lguest_dev_match,
+		.dev_attrs = lguest_dev_attrs,
+	},
+	.dev = {
+		.parent = NULL,
+		.bus_id = "lguest",
+	}
+};
+
+static int lguest_dev_probe(struct device *_dev)
+{
+	int ret;
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(dev->dev.driver,
+						struct lguest_driver, drv);
+
+	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
+	ret = drv->probe(dev);
+	if (ret == 0)
+		lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
+	return ret;
+}
+
+int register_lguest_driver(struct lguest_driver *drv)
+{
+	if (!lguest_devices)
+		return 0;
+
+	drv->drv.bus = &lguest_bus.bus;
+	drv->drv.name = drv->name;
+	drv->drv.owner = drv->owner;
+	drv->drv.probe = lguest_dev_probe;
+
+	return driver_register(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(register_lguest_driver);
+
+static void add_lguest_device(unsigned int index)
+{
+	struct lguest_device *new;
+
+	lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
+	new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		return;
+	}
+
+	new->index = index;
+	new->private = NULL;
+	memset(&new->dev, 0, sizeof(new->dev));
+	new->dev.parent = &lguest_bus.dev;
+	new->dev.bus = &lguest_bus.bus;
+	sprintf(new->dev.bus_id, "%u", index);
+	if (device_register(&new->dev) != 0) {
+		printk(KERN_EMERG "Cannot register lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		kfree(new);
+	}
+}
+
+static void scan_devices(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DEVICES; i++)
+		if (lguest_devices[i].type)
+			add_lguest_device(i);
+}
+
+static int __init lguest_bus_init(void)
+{
+	if (strcmp(paravirt_ops.name, "lguest") != 0)
+		return 0;
+
+	/* Devices are in page above top of "normal" mem. */
+	lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
+
+	if (bus_register(&lguest_bus.bus) != 0
+	    || device_register(&lguest_bus.dev) != 0)
+		panic("lguest bus registration failed");
+
+	scan_devices();
+	return 0;
+}
+postcore_initcall(lguest_bus_init);
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
new file mode 100644
index 000000000000..f30c04fc22b7
--- /dev/null
+++ b/include/linux/lguest.h
@@ -0,0 +1,85 @@
+/* Things the lguest guest needs to know.  Note: like all lguest interfaces,
+ * this is subject to wild and random change between versions. */
+#ifndef _ASM_LGUEST_H
+#define _ASM_LGUEST_H
+
+/* These are randomly chosen numbers which indicate we're an lguest at boot */
+#define LGUEST_MAGIC_EBP 0x4C687970
+#define LGUEST_MAGIC_EDI 0x652D4D65
+#define LGUEST_MAGIC_ESI 0xFFFFFFFF
+
+#ifndef __ASSEMBLY__
+#include <asm/irq.h>
+
+#define LHCALL_FLUSH_ASYNC	0
+#define LHCALL_LGUEST_INIT	1
+#define LHCALL_CRASH		2
+#define LHCALL_LOAD_GDT		3
+#define LHCALL_NEW_PGTABLE	4
+#define LHCALL_FLUSH_TLB	5
+#define LHCALL_LOAD_IDT_ENTRY	6
+#define LHCALL_SET_STACK	7
+#define LHCALL_TS		8
+#define LHCALL_TIMER_READ	9
+#define LHCALL_HALT		10
+#define LHCALL_GET_WALLCLOCK	11
+#define LHCALL_BIND_DMA		12
+#define LHCALL_SEND_DMA		13
+#define LHCALL_SET_PTE		14
+#define LHCALL_SET_PMD		15
+#define LHCALL_LOAD_TLS		16
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+static inline unsigned long
+hcall(unsigned long call,
+      unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+		     : "=a"(call)
+		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
+		     : "memory");
+	return call;
+}
+
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+/* Can't use our min() macro here: needs to be a constant */
+#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+	u32 eax, edx, ebx, ecx;
+};
+
+/* All the good stuff happens here: guest registers it with LGUEST_INIT */
+struct lguest_data
+{
+/* Fields which change during running: */
+	/* 512 == enabled (same as eflags) */
+	unsigned int irq_enabled;
+	/* Interrupts blocked by guest. */
+	DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
+
+	/* Virtual address of page fault. */
+	unsigned long cr2;
+
+	/* Async hypercall ring.  0xFF == done, 0 == pending. */
+	u8 hcall_status[LHCALL_RING_SIZE];
+	struct hcall_ring hcalls[LHCALL_RING_SIZE];
+
+/* Fields initialized by the hypervisor at boot: */
+	/* Memory not to try to access */
+	unsigned long reserve_mem;
+	/* ID of this guest (used by network driver to set ethernet address) */
+	u16 guestid;
+
+/* Fields initialized by the guest at boot: */
+	/* Instruction range to suppress interrupts even if enabled */
+	unsigned long noirq_start, noirq_end;
+};
+extern struct lguest_data lguest_data;
+#endif /* __ASSEMBLY__ */
+#endif	/* _ASM_LGUEST_H */
diff --git a/include/linux/lguest_bus.h b/include/linux/lguest_bus.h
new file mode 100644
index 000000000000..c9b4e05fee49
--- /dev/null
+++ b/include/linux/lguest_bus.h
@@ -0,0 +1,48 @@
+#ifndef _ASM_LGUEST_DEVICE_H
+#define _ASM_LGUEST_DEVICE_H
+/* Everything you need to know about lguest devices. */
+#include <linux/device.h>
+#include <linux/lguest.h>
+#include <linux/lguest_launcher.h>
+
+struct lguest_device {
+	/* Unique busid, and index into lguest_page->devices[] */
+	unsigned int index;
+
+	struct device dev;
+
+	/* Driver can hang data off here. */
+	void *private;
+};
+
+/* By convention, each device can use irq index+1 if it wants to. */
+static inline int lgdev_irq(const struct lguest_device *dev)
+{
+	return dev->index + 1;
+}
+
+/* dma args must not be vmalloced! */
+void lguest_send_dma(unsigned long key, struct lguest_dma *dma);
+int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
+		    unsigned int num, u8 irq);
+void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas);
+
+/* Map the virtual device space */
+void *lguest_map(unsigned long phys_addr, unsigned long pages);
+void lguest_unmap(void *);
+
+struct lguest_driver {
+	const char *name;
+	struct module *owner;
+	u16 device_type;
+	int (*probe)(struct lguest_device *dev);
+	void (*remove)(struct lguest_device *dev);
+
+	struct device_driver drv;
+};
+
+extern int register_lguest_driver(struct lguest_driver *drv);
+extern void unregister_lguest_driver(struct lguest_driver *drv);
+
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
+#endif /* _ASM_LGUEST_DEVICE_H */
-- 
cgit v1.3-8-gc7d7


From d7e28ffe6c74416b54345d6004fd0964c115b12c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 19 Jul 2007 01:49:23 -0700
Subject: lguest: the host code

This is the code for the "lg.ko" module, which allows lguest guests to
be launched.

[akpm@linux-foundation.org: update for futex-new-private-futexes]
[akpm@linux-foundation.org: build fix]
[jmorris@namei.org: lguest: use hrtimers]
[akpm@linux-foundation.org: x86_64 build fix]
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/kernel/tsc.c                |   4 +-
 arch/x86_64/kernel/tsc.c              |   2 +-
 drivers/lguest/core.c                 | 462 ++++++++++++++++++++++++++++++++++
 drivers/lguest/hypercalls.c           | 192 ++++++++++++++
 drivers/lguest/interrupts_and_traps.c | 268 ++++++++++++++++++++
 drivers/lguest/io.c                   | 399 +++++++++++++++++++++++++++++
 drivers/lguest/lg.h                   | 261 +++++++++++++++++++
 drivers/lguest/lguest.c               | 125 +++++++--
 drivers/lguest/lguest_asm.S           |   5 +-
 drivers/lguest/lguest_user.c          | 236 +++++++++++++++++
 drivers/lguest/page_tables.c          | 411 ++++++++++++++++++++++++++++++
 drivers/lguest/segments.c             | 125 +++++++++
 drivers/lguest/switcher.S             | 159 ++++++++++++
 include/asm-i386/tsc.h                |   1 +
 include/linux/lguest.h                |  12 +-
 include/linux/lguest_launcher.h       |  73 ++++++
 kernel/fork.c                         |   1 -
 17 files changed, 2702 insertions(+), 34 deletions(-)
 create mode 100644 drivers/lguest/core.c
 create mode 100644 drivers/lguest/hypercalls.c
 create mode 100644 drivers/lguest/interrupts_and_traps.c
 create mode 100644 drivers/lguest/io.c
 create mode 100644 drivers/lguest/lg.h
 create mode 100644 drivers/lguest/lguest_user.c
 create mode 100644 drivers/lguest/page_tables.c
 create mode 100644 drivers/lguest/segments.c
 create mode 100644 drivers/lguest/switcher.S
 create mode 100644 include/linux/lguest_launcher.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 252f9010f283..debd7dbb4158 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -27,6 +27,7 @@ static int tsc_enabled;
  * an extra value to store the TSC freq
  */
 unsigned int tsc_khz;
+EXPORT_SYMBOL_GPL(tsc_khz);
 
 int tsc_disable;
 
@@ -58,10 +59,11 @@ __setup("notsc", tsc_setup);
  */
 static int tsc_unstable;
 
-static inline int check_tsc_unstable(void)
+int check_tsc_unstable(void)
 {
 	return tsc_unstable;
 }
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
 
 /* Accellerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 48f9a8e6aa91..e850aa01e1b3 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -44,7 +44,7 @@ unsigned long long sched_clock(void)
 
 static int tsc_unstable;
 
-static inline int check_tsc_unstable(void)
+inline int check_tsc_unstable(void)
 {
 	return tsc_unstable;
 }
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
new file mode 100644
index 000000000000..ce909ec57499
--- /dev/null
+++ b/drivers/lguest/core.c
@@ -0,0 +1,462 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future.  Plus, it's fun! */
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <linux/stddef.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/poll.h>
+#include <asm/highmem.h>
+#include <asm/asm-offsets.h>
+#include <asm/i387.h>
+#include "lg.h"
+
+/* Found in switcher.S */
+extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
+extern unsigned long default_idt_entries[];
+
+/* Every guest maps the core switcher code. */
+#define SHARED_SWITCHER_PAGES \
+	DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
+/* Pages for switcher itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+
+/* We map at -4M for ease of mapping into the guest (one PTE page). */
+#define SWITCHER_ADDR 0xFFC00000
+
+static struct vm_struct *switcher_vma;
+static struct page **switcher_page;
+
+static int cpu_had_pge;
+static struct {
+	unsigned long offset;
+	unsigned short segment;
+} lguest_entry;
+
+/* This One Big lock protects all inter-guest data structures. */
+DEFINE_MUTEX(lguest_lock);
+static DEFINE_PER_CPU(struct lguest *, last_guest);
+
+/* FIXME: Make dynamic. */
+#define MAX_LGUEST_GUESTS 16
+struct lguest lguests[MAX_LGUEST_GUESTS];
+
+/* Offset from where switcher.S was compiled to where we've copied it */
+static unsigned long switcher_offset(void)
+{
+	return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+}
+
+/* This cpu's struct lguest_pages. */
+static struct lguest_pages *lguest_pages(unsigned int cpu)
+{
+	return &(((struct lguest_pages *)
+		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+}
+
+static __init int map_switcher(void)
+{
+	int i, err;
+	struct page **pagep;
+
+	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
+				GFP_KERNEL);
+	if (!switcher_page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+		unsigned long addr = get_zeroed_page(GFP_KERNEL);
+		if (!addr) {
+			err = -ENOMEM;
+			goto free_some_pages;
+		}
+		switcher_page[i] = virt_to_page(addr);
+	}
+
+	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
+				       VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
+	if (!switcher_vma) {
+		err = -ENOMEM;
+		printk("lguest: could not map switcher pages high\n");
+		goto free_pages;
+	}
+
+	pagep = switcher_page;
+	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+	if (err) {
+		printk("lguest: map_vm_area failed: %i\n", err);
+		goto free_vma;
+	}
+	memcpy(switcher_vma->addr, start_switcher_text,
+	       end_switcher_text - start_switcher_text);
+
+	/* Fix up IDT entries to point into copied text. */
+	for (i = 0; i < IDT_ENTRIES; i++)
+		default_idt_entries[i] += switcher_offset();
+
+	for_each_possible_cpu(i) {
+		struct lguest_pages *pages = lguest_pages(i);
+		struct lguest_ro_state *state = &pages->state;
+
+		/* These fields are static: rest done in copy_in_guest_info */
+		state->host_gdt_desc.size = GDT_SIZE-1;
+		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+		store_idt(&state->host_idt_desc);
+		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
+		state->guest_idt_desc.address = (long)&state->guest_idt;
+		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
+		state->guest_gdt_desc.address = (long)&state->guest_gdt;
+		state->guest_tss.esp0 = (long)(&pages->regs + 1);
+		state->guest_tss.ss0 = LGUEST_DS;
+		/* No I/O for you! */
+		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+		setup_default_gdt_entries(state);
+		setup_default_idt_entries(state, default_idt_entries);
+
+		/* Setup LGUEST segments on all cpus */
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+	}
+
+	/* Initialize entry point into switcher. */
+	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
+	lguest_entry.segment = LGUEST_CS;
+
+	printk(KERN_INFO "lguest: mapped switcher at %p\n",
+	       switcher_vma->addr);
+	return 0;
+
+free_vma:
+	vunmap(switcher_vma->addr);
+free_pages:
+	i = TOTAL_SWITCHER_PAGES;
+free_some_pages:
+	for (--i; i >= 0; i--)
+		__free_pages(switcher_page[i], 0);
+	kfree(switcher_page);
+out:
+	return err;
+}
+
+static void unmap_switcher(void)
+{
+	unsigned int i;
+
+	vunmap(switcher_vma->addr);
+	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
+		__free_pages(switcher_page[i], 0);
+}
+
+/* IN/OUT insns: enough to get us past boot-time probing. */
+static int emulate_insn(struct lguest *lg)
+{
+	u8 insn;
+	unsigned int insnlen = 0, in = 0, shift = 0;
+	unsigned long physaddr = guest_pa(lg, lg->regs->eip);
+
+	/* This only works for addresses in linear mapping... */
+	if (lg->regs->eip < lg->page_offset)
+		return 0;
+	lgread(lg, &insn, physaddr, 1);
+
+	/* Operand size prefix means it's actually for ax. */
+	if (insn == 0x66) {
+		shift = 16;
+		insnlen = 1;
+		lgread(lg, &insn, physaddr + insnlen, 1);
+	}
+
+	switch (insn & 0xFE) {
+	case 0xE4: /* in     <next byte>,%al */
+		insnlen += 2;
+		in = 1;
+		break;
+	case 0xEC: /* in     (%dx),%al */
+		insnlen += 1;
+		in = 1;
+		break;
+	case 0xE6: /* out    %al,<next byte> */
+		insnlen += 2;
+		break;
+	case 0xEE: /* out    %al,(%dx) */
+		insnlen += 1;
+		break;
+	default:
+		return 0;
+	}
+
+	if (in) {
+		/* Lower bit tells is whether it's a 16 or 32 bit access */
+		if (insn & 0x1)
+			lg->regs->eax = 0xFFFFFFFF;
+		else
+			lg->regs->eax |= (0xFFFF << shift);
+	}
+	lg->regs->eip += insnlen;
+	return 1;
+}
+
+int lguest_address_ok(const struct lguest *lg,
+		      unsigned long addr, unsigned long len)
+{
+	return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
+}
+
+/* Just like get_user, but don't let guest access lguest binary. */
+u32 lgread_u32(struct lguest *lg, unsigned long addr)
+{
+	u32 val = 0;
+
+	/* Don't let them access lguest binary */
+	if (!lguest_address_ok(lg, addr, sizeof(val))
+	    || get_user(val, (u32 __user *)addr) != 0)
+		kill_guest(lg, "bad read address %#lx", addr);
+	return val;
+}
+
+void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
+{
+	if (!lguest_address_ok(lg, addr, sizeof(val))
+	    || put_user(val, (u32 __user *)addr) != 0)
+		kill_guest(lg, "bad write address %#lx", addr);
+}
+
+void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
+{
+	if (!lguest_address_ok(lg, addr, bytes)
+	    || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+		/* copy_from_user should do this, but as we rely on it... */
+		memset(b, 0, bytes);
+		kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
+	}
+}
+
+void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
+	     unsigned bytes)
+{
+	if (!lguest_address_ok(lg, addr, bytes)
+	    || copy_to_user((void __user *)addr, b, bytes) != 0)
+		kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
+}
+
+static void set_ts(void)
+{
+	u32 cr0;
+
+	cr0 = read_cr0();
+	if (!(cr0 & 8))
+		write_cr0(cr0|8);
+}
+
+static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+{
+	if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
+		__get_cpu_var(last_guest) = lg;
+		lg->last_pages = pages;
+		lg->changed = CHANGED_ALL;
+	}
+
+	/* These are pretty cheap, so we do them unconditionally. */
+	pages->state.host_cr3 = __pa(current->mm->pgd);
+	map_switcher_in_guest(lg, pages);
+	pages->state.guest_tss.esp1 = lg->esp1;
+	pages->state.guest_tss.ss1 = lg->ss1;
+
+	/* Copy direct trap entries. */
+	if (lg->changed & CHANGED_IDT)
+		copy_traps(lg, pages->state.guest_idt, default_idt_entries);
+
+	/* Copy all GDT entries but the TSS. */
+	if (lg->changed & CHANGED_GDT)
+		copy_gdt(lg, pages->state.guest_gdt);
+	/* If only the TLS entries have changed, copy them. */
+	else if (lg->changed & CHANGED_GDT_TLS)
+		copy_gdt_tls(lg, pages->state.guest_gdt);
+
+	lg->changed = 0;
+}
+
+static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+{
+	unsigned int clobber;
+
+	copy_in_guest_info(lg, pages);
+
+	/* Put eflags on stack, lcall does rest: suitable for iret return. */
+	asm volatile("pushf; lcall *lguest_entry"
+		     : "=a"(clobber), "=b"(clobber)
+		     : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+		     : "memory", "%edx", "%ecx", "%edi", "%esi");
+}
+
+int run_guest(struct lguest *lg, unsigned long __user *user)
+{
+	while (!lg->dead) {
+		unsigned int cr2 = 0; /* Damn gcc */
+
+		/* Hypercalls first: we might have been out to userspace */
+		do_hypercalls(lg);
+		if (lg->dma_is_pending) {
+			if (put_user(lg->pending_dma, user) ||
+			    put_user(lg->pending_key, user+1))
+				return -EFAULT;
+			return sizeof(unsigned long)*2;
+		}
+
+		if (signal_pending(current))
+			return -ERESTARTSYS;
+
+		/* If Waker set break_out, return to Launcher. */
+		if (lg->break_out)
+			return -EAGAIN;
+
+		maybe_do_interrupt(lg);
+
+		try_to_freeze();
+
+		if (lg->dead)
+			break;
+
+		if (lg->halted) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			continue;
+		}
+
+		local_irq_disable();
+
+		/* Even if *we* don't want FPU trap, guest might... */
+		if (lg->ts)
+			set_ts();
+
+		/* Don't let Guest do SYSENTER: we can't handle it. */
+		if (boot_cpu_has(X86_FEATURE_SEP))
+			wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
+
+		run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
+
+		/* Save cr2 now if we page-faulted. */
+		if (lg->regs->trapnum == 14)
+			cr2 = read_cr2();
+		else if (lg->regs->trapnum == 7)
+			math_state_restore();
+
+		if (boot_cpu_has(X86_FEATURE_SEP))
+			wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+		local_irq_enable();
+
+		switch (lg->regs->trapnum) {
+		case 13: /* We've intercepted a GPF. */
+			if (lg->regs->errcode == 0) {
+				if (emulate_insn(lg))
+					continue;
+			}
+			break;
+		case 14: /* We've intercepted a page fault. */
+			if (demand_page(lg, cr2, lg->regs->errcode))
+				continue;
+
+			/* If lguest_data is NULL, this won't hurt. */
+			if (put_user(cr2, &lg->lguest_data->cr2))
+				kill_guest(lg, "Writing cr2");
+			break;
+		case 7: /* We've intercepted a Device Not Available fault. */
+			/* If they don't want to know, just absorb it. */
+			if (!lg->ts)
+				continue;
+			break;
+		case 32 ... 255: /* Real interrupt, fall thru */
+			cond_resched();
+		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+			continue;
+		}
+
+		if (deliver_trap(lg, lg->regs->trapnum))
+			continue;
+
+		kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
+			   lg->regs->trapnum, lg->regs->eip,
+			   lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
+	}
+	return -ENOENT;
+}
+
+int find_free_guest(void)
+{
+	unsigned int i;
+	for (i = 0; i < MAX_LGUEST_GUESTS; i++)
+		if (!lguests[i].tsk)
+			return i;
+	return -1;
+}
+
+static void adjust_pge(void *on)
+{
+	if (on)
+		write_cr4(read_cr4() | X86_CR4_PGE);
+	else
+		write_cr4(read_cr4() & ~X86_CR4_PGE);
+}
+
+static int __init init(void)
+{
+	int err;
+
+	if (paravirt_enabled()) {
+		printk("lguest is afraid of %s\n", paravirt_ops.name);
+		return -EPERM;
+	}
+
+	err = map_switcher();
+	if (err)
+		return err;
+
+	err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
+	if (err) {
+		unmap_switcher();
+		return err;
+	}
+	lguest_io_init();
+
+	err = lguest_device_init();
+	if (err) {
+		free_pagetables();
+		unmap_switcher();
+		return err;
+	}
+	lock_cpu_hotplug();
+	if (cpu_has_pge) { /* We have a broader idea of "global". */
+		cpu_had_pge = 1;
+		on_each_cpu(adjust_pge, (void *)0, 0, 1);
+		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+	}
+	unlock_cpu_hotplug();
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	lguest_device_remove();
+	free_pagetables();
+	unmap_switcher();
+	lock_cpu_hotplug();
+	if (cpu_had_pge) {
+		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		on_each_cpu(adjust_pge, (void *)1, 0, 1);
+	}
+	unlock_cpu_hotplug();
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
new file mode 100644
index 000000000000..ea52ca451f74
--- /dev/null
+++ b/drivers/lguest/hypercalls.c
@@ -0,0 +1,192 @@
+/*  Actual hypercalls, which allow guests to actually do something.
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <irq_vectors.h>
+#include "lg.h"
+
+static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+	switch (regs->eax) {
+	case LHCALL_FLUSH_ASYNC:
+		break;
+	case LHCALL_LGUEST_INIT:
+		kill_guest(lg, "already have lguest_data");
+		break;
+	case LHCALL_CRASH: {
+		char msg[128];
+		lgread(lg, msg, regs->edx, sizeof(msg));
+		msg[sizeof(msg)-1] = '\0';
+		kill_guest(lg, "CRASH: %s", msg);
+		break;
+	}
+	case LHCALL_FLUSH_TLB:
+		if (regs->edx)
+			guest_pagetable_clear_all(lg);
+		else
+			guest_pagetable_flush_user(lg);
+		break;
+	case LHCALL_GET_WALLCLOCK: {
+		struct timespec ts;
+		ktime_get_real_ts(&ts);
+		regs->eax = ts.tv_sec;
+		break;
+	}
+	case LHCALL_BIND_DMA:
+		regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+				     regs->ecx >> 8, regs->ecx & 0xFF);
+		break;
+	case LHCALL_SEND_DMA:
+		send_dma(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_LOAD_GDT:
+		load_guest_gdt(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_LOAD_IDT_ENTRY:
+		load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_NEW_PGTABLE:
+		guest_new_pagetable(lg, regs->edx);
+		break;
+	case LHCALL_SET_STACK:
+		guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_SET_PTE:
+		guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx));
+		break;
+	case LHCALL_SET_PMD:
+		guest_set_pmd(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_LOAD_TLS:
+		guest_load_tls(lg, regs->edx);
+		break;
+	case LHCALL_SET_CLOCKEVENT:
+		guest_set_clockevent(lg, regs->edx);
+		break;
+	case LHCALL_TS:
+		lg->ts = regs->edx;
+		break;
+	case LHCALL_HALT:
+		lg->halted = 1;
+		break;
+	default:
+		kill_guest(lg, "Bad hypercall %li\n", regs->eax);
+	}
+}
+
+/* We always do queued calls before actual hypercall. */
+static void do_async_hcalls(struct lguest *lg)
+{
+	unsigned int i;
+	u8 st[LHCALL_RING_SIZE];
+
+	if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(st); i++) {
+		struct lguest_regs regs;
+		unsigned int n = lg->next_hcall;
+
+		if (st[n] == 0xFF)
+			break;
+
+		if (++lg->next_hcall == LHCALL_RING_SIZE)
+			lg->next_hcall = 0;
+
+		if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
+		    || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
+		    || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
+		    || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) {
+			kill_guest(lg, "Fetching async hypercalls");
+			break;
+		}
+
+		do_hcall(lg, &regs);
+		if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
+			kill_guest(lg, "Writing result for async hypercall");
+			break;
+		}
+
+		if (lg->dma_is_pending)
+			break;
+	}
+}
+
+static void initialize(struct lguest *lg)
+{
+	u32 tsc_speed;
+
+	if (lg->regs->eax != LHCALL_LGUEST_INIT) {
+		kill_guest(lg, "hypercall %li before LGUEST_INIT",
+			   lg->regs->eax);
+		return;
+	}
+
+	/* We only tell the guest to use the TSC if it's reliable. */
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
+		tsc_speed = tsc_khz;
+	else
+		tsc_speed = 0;
+
+	lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
+	/* We check here so we can simply copy_to_user/from_user */
+	if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
+		kill_guest(lg, "bad guest page %p", lg->lguest_data);
+		return;
+	}
+	if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
+	    || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
+	    /* We reserve the top pgd entry. */
+	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
+	    || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
+	    || put_user(lg->guestid, &lg->lguest_data->guestid))
+		kill_guest(lg, "bad guest page %p", lg->lguest_data);
+
+	/* This is the one case where the above accesses might have
+	 * been the first write to a Guest page.  This may have caused
+	 * a copy-on-write fault, but the Guest might be referring to
+	 * the old (read-only) page. */
+	guest_pagetable_clear_all(lg);
+}
+
+/* Even if we go out to userspace and come back, we don't want to do
+ * the hypercall again. */
+static void clear_hcall(struct lguest *lg)
+{
+	lg->regs->trapnum = 255;
+}
+
+void do_hypercalls(struct lguest *lg)
+{
+	if (unlikely(!lg->lguest_data)) {
+		if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
+			initialize(lg);
+			clear_hcall(lg);
+		}
+		return;
+	}
+
+	do_async_hcalls(lg);
+	if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
+		do_hcall(lg, lg->regs);
+		clear_hcall(lg);
+	}
+}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
new file mode 100644
index 000000000000..d9de5bbc613f
--- /dev/null
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -0,0 +1,268 @@
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static unsigned long idt_address(u32 lo, u32 hi)
+{
+	return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
+}
+
+static int idt_type(u32 lo, u32 hi)
+{
+	return (hi >> 8) & 0xF;
+}
+
+static int idt_present(u32 lo, u32 hi)
+{
+	return (hi & 0x8000);
+}
+
+static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
+{
+	*gstack -= 4;
+	lgwrite_u32(lg, *gstack, val);
+}
+
+static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
+{
+	unsigned long gstack;
+	u32 eflags, ss, irq_enable;
+
+	/* If they want a ring change, we use new stack and push old ss/esp */
+	if ((lg->regs->ss&0x3) != GUEST_PL) {
+		gstack = guest_pa(lg, lg->esp1);
+		ss = lg->ss1;
+		push_guest_stack(lg, &gstack, lg->regs->ss);
+		push_guest_stack(lg, &gstack, lg->regs->esp);
+	} else {
+		gstack = guest_pa(lg, lg->regs->esp);
+		ss = lg->regs->ss;
+	}
+
+	/* We use IF bit in eflags to indicate whether irqs were disabled
+	   (it's always 0, since irqs are enabled when guest is running). */
+	eflags = lg->regs->eflags;
+	if (get_user(irq_enable, &lg->lguest_data->irq_enabled))
+		irq_enable = 0;
+	eflags |= (irq_enable & X86_EFLAGS_IF);
+
+	push_guest_stack(lg, &gstack, eflags);
+	push_guest_stack(lg, &gstack, lg->regs->cs);
+	push_guest_stack(lg, &gstack, lg->regs->eip);
+
+	if (has_err)
+		push_guest_stack(lg, &gstack, lg->regs->errcode);
+
+	/* Change the real stack so switcher returns to trap handler */
+	lg->regs->ss = ss;
+	lg->regs->esp = gstack + lg->page_offset;
+	lg->regs->cs = (__KERNEL_CS|GUEST_PL);
+	lg->regs->eip = idt_address(lo, hi);
+
+	/* Disable interrupts for an interrupt gate. */
+	if (idt_type(lo, hi) == 0xE)
+		if (put_user(0, &lg->lguest_data->irq_enabled))
+			kill_guest(lg, "Disabling interrupts");
+}
+
+void maybe_do_interrupt(struct lguest *lg)
+{
+	unsigned int irq;
+	DECLARE_BITMAP(blk, LGUEST_IRQS);
+	struct desc_struct *idt;
+
+	if (!lg->lguest_data)
+		return;
+
+	/* Mask out any interrupts they have blocked. */
+	if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
+			   sizeof(blk)))
+		return;
+
+	bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
+
+	irq = find_first_bit(blk, LGUEST_IRQS);
+	if (irq >= LGUEST_IRQS)
+		return;
+
+	if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
+		return;
+
+	/* If they're halted, we re-enable interrupts. */
+	if (lg->halted) {
+		/* Re-enable interrupts. */
+		if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
+			kill_guest(lg, "Re-enabling interrupts");
+		lg->halted = 0;
+	} else {
+		/* Maybe they have interrupts disabled? */
+		u32 irq_enabled;
+		if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
+			irq_enabled = 0;
+		if (!irq_enabled)
+			return;
+	}
+
+	idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
+	if (idt_present(idt->a, idt->b)) {
+		clear_bit(irq, lg->irqs_pending);
+		set_guest_interrupt(lg, idt->a, idt->b, 0);
+	}
+}
+
+static int has_err(unsigned int trap)
+{
+	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
+}
+
+int deliver_trap(struct lguest *lg, unsigned int num)
+{
+	u32 lo = lg->idt[num].a, hi = lg->idt[num].b;
+
+	if (!idt_present(lo, hi))
+		return 0;
+	set_guest_interrupt(lg, lo, hi, has_err(num));
+	return 1;
+}
+
+static int direct_trap(const struct lguest *lg,
+		       const struct desc_struct *trap,
+		       unsigned int num)
+{
+	/* Hardware interrupts don't go to guest (except syscall). */
+	if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
+		return 0;
+
+	/* We intercept page fault (demand shadow paging & cr2 saving)
+	   protection fault (in/out emulation) and device not
+	   available (TS handling), and hypercall */
+	if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
+		return 0;
+
+	/* Interrupt gates (0xE) or not present (0x0) can't go direct. */
+	return idt_type(trap->a, trap->b) == 0xF;
+}
+
+void pin_stack_pages(struct lguest *lg)
+{
+	unsigned int i;
+
+	for (i = 0; i < lg->stack_pages; i++)
+		pin_page(lg, lg->esp1 - i * PAGE_SIZE);
+}
+
+void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
+{
+	/* You cannot have a stack segment with priv level 0. */
+	if ((seg & 0x3) != GUEST_PL)
+		kill_guest(lg, "bad stack segment %i", seg);
+	if (pages > 2)
+		kill_guest(lg, "bad stack pages %u", pages);
+	lg->ss1 = seg;
+	lg->esp1 = esp;
+	lg->stack_pages = pages;
+	pin_stack_pages(lg);
+}
+
+/* Set up trap in IDT. */
+static void set_trap(struct lguest *lg, struct desc_struct *trap,
+		     unsigned int num, u32 lo, u32 hi)
+{
+	u8 type = idt_type(lo, hi);
+
+	if (!idt_present(lo, hi)) {
+		trap->a = trap->b = 0;
+		return;
+	}
+
+	if (type != 0xE && type != 0xF)
+		kill_guest(lg, "bad IDT type %i", type);
+
+	trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
+	trap->b = (hi&0xFFFFEF00);
+}
+
+void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+{
+	/* Guest never handles: NMI, doublefault, hypercall, spurious irq. */
+	if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
+		return;
+
+	lg->changed |= CHANGED_IDT;
+	if (num < ARRAY_SIZE(lg->idt))
+		set_trap(lg, &lg->idt[num], num, lo, hi);
+	else if (num == SYSCALL_VECTOR)
+		set_trap(lg, &lg->syscall_idt, num, lo, hi);
+}
+
+static void default_idt_entry(struct desc_struct *idt,
+			      int trap,
+			      const unsigned long handler)
+{
+	u32 flags = 0x8e00;
+
+	/* They can't "int" into any of them except hypercall. */
+	if (trap == LGUEST_TRAP_ENTRY)
+		flags |= (GUEST_PL << 13);
+
+	idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
+	idt->b = (handler&0xFFFF0000) | flags;
+}
+
+void setup_default_idt_entries(struct lguest_ro_state *state,
+			       const unsigned long *def)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
+		default_idt_entry(&state->guest_idt[i], i, def[i]);
+}
+
+void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+		const unsigned long *def)
+{
+	unsigned int i;
+
+	/* All hardware interrupts are same whatever the guest: only the
+	 * traps might be different. */
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+		if (direct_trap(lg, &lg->idt[i], i))
+			idt[i] = lg->idt[i];
+		else
+			default_idt_entry(&idt[i], i, def[i]);
+	}
+	i = SYSCALL_VECTOR;
+	if (direct_trap(lg, &lg->syscall_idt, i))
+		idt[i] = lg->syscall_idt;
+	else
+		default_idt_entry(&idt[i], i, def[i]);
+}
+
+void guest_set_clockevent(struct lguest *lg, unsigned long delta)
+{
+	ktime_t expires;
+
+	if (unlikely(delta == 0)) {
+		/* Clock event device is shutting down. */
+		hrtimer_cancel(&lg->hrt);
+		return;
+	}
+
+	expires = ktime_add_ns(ktime_get_real(), delta);
+	hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
+}
+
+static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
+{
+	struct lguest *lg = container_of(timer, struct lguest, hrt);
+
+	set_bit(0, lg->irqs_pending);
+	if (lg->halted)
+		wake_up_process(lg->tsk);
+	return HRTIMER_NORESTART;
+}
+
+void init_clockdev(struct lguest *lg)
+{
+	hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	lg->hrt.function = clockdev_fn;
+}
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c
new file mode 100644
index 000000000000..06bdba2337ef
--- /dev/null
+++ b/drivers/lguest/io.c
@@ -0,0 +1,399 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ */
+#include <linux/types.h>
+#include <linux/futex.h>
+#include <linux/jhash.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static struct list_head dma_hash[61];
+
+void lguest_io_init(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
+		INIT_LIST_HEAD(&dma_hash[i]);
+}
+
+/* FIXME: allow multi-page lengths. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (!dma->len[i])
+			return 1;
+		if (!lguest_address_ok(lg, dma->addr[i], dma->len[i]))
+			goto kill;
+		if (dma->len[i] > PAGE_SIZE)
+			goto kill;
+		/* We could do over a page, but is it worth it? */
+		if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
+			goto kill;
+	}
+	return 1;
+
+kill:
+	kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]);
+	return 0;
+}
+
+static unsigned int hash(const union futex_key *key)
+{
+	return jhash2((u32*)&key->both.word,
+		      (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+		      key->both.offset)
+		% ARRAY_SIZE(dma_hash);
+}
+
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+	return (a->both.word == b->both.word
+		&& a->both.ptr == b->both.ptr
+		&& a->both.offset == b->both.offset);
+}
+
+/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
+static void unlink_dma(struct lguest_dma_info *dmainfo)
+{
+	BUG_ON(!mutex_is_locked(&lguest_lock));
+	dmainfo->interrupt = 0;
+	list_del(&dmainfo->list);
+	drop_futex_key_refs(&dmainfo->key);
+}
+
+static int unbind_dma(struct lguest *lg,
+		      const union futex_key *key,
+		      unsigned long dmas)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
+			unlink_dma(&lg->dma[i]);
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
+int bind_dma(struct lguest *lg,
+	     unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
+{
+	unsigned int i;
+	int ret = 0;
+	union futex_key key;
+	struct rw_semaphore *fshared = &current->mm->mmap_sem;
+
+	if (interrupt >= LGUEST_IRQS)
+		return 0;
+
+	mutex_lock(&lguest_lock);
+	down_read(fshared);
+	if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
+		kill_guest(lg, "bad dma key %#lx", ukey);
+		goto unlock;
+	}
+	get_futex_key_refs(&key);
+
+	if (interrupt == 0)
+		ret = unbind_dma(lg, &key, dmas);
+	else {
+		for (i = 0; i < LGUEST_MAX_DMA; i++) {
+			if (lg->dma[i].interrupt)
+				continue;
+
+			lg->dma[i].dmas = dmas;
+			lg->dma[i].num_dmas = numdmas;
+			lg->dma[i].next_dma = 0;
+			lg->dma[i].key = key;
+			lg->dma[i].guestid = lg->guestid;
+			lg->dma[i].interrupt = interrupt;
+			list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
+			ret = 1;
+			goto unlock;
+		}
+	}
+	drop_futex_key_refs(&key);
+unlock:
+ 	up_read(fshared);
+	mutex_unlock(&lguest_lock);
+	return ret;
+}
+
+/* lgread from another guest */
+static int lgread_other(struct lguest *lg,
+			void *buf, u32 addr, unsigned bytes)
+{
+	if (!lguest_address_ok(lg, addr, bytes)
+	    || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
+		memset(buf, 0, bytes);
+		kill_guest(lg, "bad address in registered DMA struct");
+		return 0;
+	}
+	return 1;
+}
+
+/* lgwrite to another guest */
+static int lgwrite_other(struct lguest *lg, u32 addr,
+			 const void *buf, unsigned bytes)
+{
+	if (!lguest_address_ok(lg, addr, bytes)
+	    || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
+		!= bytes)) {
+		kill_guest(lg, "bad address writing to registered DMA");
+		return 0;
+	}
+	return 1;
+}
+
+static u32 copy_data(struct lguest *srclg,
+		     const struct lguest_dma *src,
+		     const struct lguest_dma *dst,
+		     struct page *pages[])
+{
+	unsigned int totlen, si, di, srcoff, dstoff;
+	void *maddr = NULL;
+
+	totlen = 0;
+	si = di = 0;
+	srcoff = dstoff = 0;
+	while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
+	       && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
+		u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
+
+		if (!maddr)
+			maddr = kmap(pages[di]);
+
+		/* FIXME: This is not completely portable, since
+		   archs do different things for copy_to_user_page. */
+		if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
+				   (void *__user)src->addr[si], len) != 0) {
+			kill_guest(srclg, "bad address in sending DMA");
+			totlen = 0;
+			break;
+		}
+
+		totlen += len;
+		srcoff += len;
+		dstoff += len;
+		if (srcoff == src->len[si]) {
+			si++;
+			srcoff = 0;
+		}
+		if (dstoff == dst->len[di]) {
+			kunmap(pages[di]);
+			maddr = NULL;
+			di++;
+			dstoff = 0;
+		}
+	}
+
+	if (maddr)
+		kunmap(pages[di]);
+
+	return totlen;
+}
+
+/* Src is us, ie. current. */
+static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
+		  struct lguest *dstlg, const struct lguest_dma *dst)
+{
+	int i;
+	u32 ret;
+	struct page *pages[LGUEST_MAX_DMA_SECTIONS];
+
+	if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
+		return 0;
+
+	/* First get the destination pages */
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (dst->len[i] == 0)
+			break;
+		if (get_user_pages(dstlg->tsk, dstlg->mm,
+				   dst->addr[i], 1, 1, 1, pages+i, NULL)
+		    != 1) {
+			kill_guest(dstlg, "Error mapping DMA pages");
+			ret = 0;
+			goto drop_pages;
+		}
+	}
+
+	/* Now copy until we run out of src or dst. */
+	ret = copy_data(srclg, src, dst, pages);
+
+drop_pages:
+	while (--i >= 0)
+		put_page(pages[i]);
+	return ret;
+}
+
+static int dma_transfer(struct lguest *srclg,
+			unsigned long udma,
+			struct lguest_dma_info *dst)
+{
+	struct lguest_dma dst_dma, src_dma;
+	struct lguest *dstlg;
+	u32 i, dma = 0;
+
+	dstlg = &lguests[dst->guestid];
+	/* Get our dma list. */
+	lgread(srclg, &src_dma, udma, sizeof(src_dma));
+
+	/* We can't deadlock against them dmaing to us, because this
+	 * is all under the lguest_lock. */
+	down_read(&dstlg->mm->mmap_sem);
+
+	for (i = 0; i < dst->num_dmas; i++) {
+		dma = (dst->next_dma + i) % dst->num_dmas;
+		if (!lgread_other(dstlg, &dst_dma,
+				  dst->dmas + dma * sizeof(struct lguest_dma),
+				  sizeof(dst_dma))) {
+			goto fail;
+		}
+		if (!dst_dma.used_len)
+			break;
+	}
+	if (i != dst->num_dmas) {
+		unsigned long used_lenp;
+		unsigned int ret;
+
+		ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
+		/* Put used length in src. */
+		lgwrite_u32(srclg,
+			    udma+offsetof(struct lguest_dma, used_len), ret);
+		if (ret == 0 && src_dma.len[0] != 0)
+			goto fail;
+
+		/* Make sure destination sees contents before length. */
+		wmb();
+		used_lenp = dst->dmas
+			+ dma * sizeof(struct lguest_dma)
+			+ offsetof(struct lguest_dma, used_len);
+		lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
+		dst->next_dma++;
+	}
+ 	up_read(&dstlg->mm->mmap_sem);
+
+	/* Do this last so dst doesn't simply sleep on lock. */
+	set_bit(dst->interrupt, dstlg->irqs_pending);
+	wake_up_process(dstlg->tsk);
+	return i == dst->num_dmas;
+
+fail:
+	up_read(&dstlg->mm->mmap_sem);
+	return 0;
+}
+
+void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
+{
+	union futex_key key;
+	int empty = 0;
+	struct rw_semaphore *fshared = &current->mm->mmap_sem;
+
+again:
+	mutex_lock(&lguest_lock);
+	down_read(fshared);
+	if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
+		kill_guest(lg, "bad sending DMA key");
+		goto unlock;
+	}
+	/* Shared mapping?  Look for other guests... */
+	if (key.shared.offset & 1) {
+		struct lguest_dma_info *i;
+		list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+			if (i->guestid == lg->guestid)
+				continue;
+			if (!key_eq(&key, &i->key))
+				continue;
+
+			empty += dma_transfer(lg, udma, i);
+			break;
+		}
+		if (empty == 1) {
+			/* Give any recipients one chance to restock. */
+			up_read(&current->mm->mmap_sem);
+			mutex_unlock(&lguest_lock);
+			empty++;
+			goto again;
+		}
+	} else {
+		/* Private mapping: tell our userspace. */
+		lg->dma_is_pending = 1;
+		lg->pending_dma = udma;
+		lg->pending_key = ukey;
+	}
+unlock:
+	up_read(fshared);
+	mutex_unlock(&lguest_lock);
+}
+
+void release_all_dma(struct lguest *lg)
+{
+	unsigned int i;
+
+	BUG_ON(!mutex_is_locked(&lguest_lock));
+
+	down_read(&lg->mm->mmap_sem);
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (lg->dma[i].interrupt)
+			unlink_dma(&lg->dma[i]);
+	}
+	up_read(&lg->mm->mmap_sem);
+}
+
+/* Userspace wants a dma buffer from this guest. */
+unsigned long get_dma_buffer(struct lguest *lg,
+			     unsigned long ukey, unsigned long *interrupt)
+{
+	unsigned long ret = 0;
+	union futex_key key;
+	struct lguest_dma_info *i;
+	struct rw_semaphore *fshared = &current->mm->mmap_sem;
+
+	mutex_lock(&lguest_lock);
+	down_read(fshared);
+	if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
+		kill_guest(lg, "bad registered DMA buffer");
+		goto unlock;
+	}
+	list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+		if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
+			unsigned int j;
+			for (j = 0; j < i->num_dmas; j++) {
+				struct lguest_dma dma;
+
+				ret = i->dmas + j * sizeof(struct lguest_dma);
+				lgread(lg, &dma, ret, sizeof(dma));
+				if (dma.used_len == 0)
+					break;
+			}
+			*interrupt = i->interrupt;
+			break;
+		}
+	}
+unlock:
+	up_read(fshared);
+	mutex_unlock(&lguest_lock);
+	return ret;
+}
+
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
new file mode 100644
index 000000000000..3e2ddfbc816e
--- /dev/null
+++ b/drivers/lguest/lg.h
@@ -0,0 +1,261 @@
+#ifndef _LGUEST_H
+#define _LGUEST_H
+
+#include <asm/desc.h>
+
+#define GDT_ENTRY_LGUEST_CS	10
+#define GDT_ENTRY_LGUEST_DS	11
+#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/stringify.h>
+#include <linux/binfmts.h>
+#include <linux/futex.h>
+#include <linux/lguest.h>
+#include <linux/lguest_launcher.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <asm/semaphore.h>
+#include "irq_vectors.h"
+
+#define GUEST_PL 1
+
+struct lguest_regs
+{
+	/* Manually saved part. */
+	unsigned long ebx, ecx, edx;
+	unsigned long esi, edi, ebp;
+	unsigned long gs;
+	unsigned long eax;
+	unsigned long fs, ds, es;
+	unsigned long trapnum, errcode;
+	/* Trap pushed part */
+	unsigned long eip;
+	unsigned long cs;
+	unsigned long eflags;
+	unsigned long esp;
+	unsigned long ss;
+};
+
+void free_pagetables(void);
+int init_pagetables(struct page **switcher_page, unsigned int pages);
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+
+struct lguest_dma_info
+{
+	struct list_head list;
+	union futex_key key;
+	unsigned long dmas;
+	u16 next_dma;
+	u16 num_dmas;
+	u16 guestid;
+	u8 interrupt; 	/* 0 when not registered */
+};
+
+/* We have separate types for the guest's ptes & pgds and the shadow ptes &
+ * pgds.  Since this host might use three-level pagetables and the guest and
+ * shadow pagetables don't, we can't use the normal pte_t/pgd_t. */
+typedef union {
+	struct { unsigned flags:12, pfn:20; };
+	struct { unsigned long val; } raw;
+} spgd_t;
+typedef union {
+	struct { unsigned flags:12, pfn:20; };
+	struct { unsigned long val; } raw;
+} spte_t;
+typedef union {
+	struct { unsigned flags:12, pfn:20; };
+	struct { unsigned long val; } raw;
+} gpgd_t;
+typedef union {
+	struct { unsigned flags:12, pfn:20; };
+	struct { unsigned long val; } raw;
+} gpte_t;
+#define mkgpte(_val) ((gpte_t){.raw.val = _val})
+#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
+
+struct pgdir
+{
+	unsigned long cr3;
+	spgd_t *pgdir;
+};
+
+/* This is a guest-specific page (mapped ro) into the guest. */
+struct lguest_ro_state
+{
+	/* Host information we need to restore when we switch back. */
+	u32 host_cr3;
+	struct Xgt_desc_struct host_idt_desc;
+	struct Xgt_desc_struct host_gdt_desc;
+	u32 host_sp;
+
+	/* Fields which are used when guest is running. */
+	struct Xgt_desc_struct guest_idt_desc;
+	struct Xgt_desc_struct guest_gdt_desc;
+	struct i386_hw_tss guest_tss;
+	struct desc_struct guest_idt[IDT_ENTRIES];
+	struct desc_struct guest_gdt[GDT_ENTRIES];
+};
+
+/* We have two pages shared with guests, per cpu.  */
+struct lguest_pages
+{
+	/* This is the stack page mapped rw in guest */
+	char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
+	struct lguest_regs regs;
+
+	/* This is the host state & guest descriptor page, ro in guest */
+	struct lguest_ro_state state;
+} __attribute__((aligned(PAGE_SIZE)));
+
+#define CHANGED_IDT		1
+#define CHANGED_GDT		2
+#define CHANGED_GDT_TLS		4 /* Actually a subset of CHANGED_GDT */
+#define CHANGED_ALL	        3
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+	/* At end of a page shared mapped over lguest_pages in guest.  */
+	unsigned long regs_page;
+	struct lguest_regs *regs;
+	struct lguest_data __user *lguest_data;
+	struct task_struct *tsk;
+	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
+	u16 guestid;
+	u32 pfn_limit;
+	u32 page_offset;
+	u32 cr2;
+	int halted;
+	int ts;
+	u32 next_hcall;
+	u32 esp1;
+	u8 ss1;
+
+	/* Do we need to stop what we're doing and return to userspace? */
+	int break_out;
+	wait_queue_head_t break_wq;
+
+	/* Bitmap of what has changed: see CHANGED_* above. */
+	int changed;
+	struct lguest_pages *last_pages;
+
+	/* We keep a small number of these. */
+	u32 pgdidx;
+	struct pgdir pgdirs[4];
+
+	/* Cached wakeup: we hold a reference to this task. */
+	struct task_struct *wake;
+
+	unsigned long noirq_start, noirq_end;
+	int dma_is_pending;
+	unsigned long pending_dma; /* struct lguest_dma */
+	unsigned long pending_key; /* address they're sending to */
+
+	unsigned int stack_pages;
+	u32 tsc_khz;
+
+	struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+	/* Dead? */
+	const char *dead;
+
+	/* The GDT entries copied into lguest_ro_state when running. */
+	struct desc_struct gdt[GDT_ENTRIES];
+
+	/* The IDT entries: some copied into lguest_ro_state when running. */
+	struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
+	struct desc_struct syscall_idt;
+
+	/* Virtual clock device */
+	struct hrtimer hrt;
+
+	/* Pending virtual interrupts */
+	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+};
+
+extern struct lguest lguests[];
+extern struct mutex lguest_lock;
+
+/* core.c: */
+u32 lgread_u32(struct lguest *lg, unsigned long addr);
+void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val);
+void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len);
+void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len);
+int find_free_guest(void);
+int lguest_address_ok(const struct lguest *lg,
+		      unsigned long addr, unsigned long len);
+int run_guest(struct lguest *lg, unsigned long __user *user);
+
+
+/* interrupts_and_traps.c: */
+void maybe_do_interrupt(struct lguest *lg);
+int deliver_trap(struct lguest *lg, unsigned int num);
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
+void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages);
+void pin_stack_pages(struct lguest *lg);
+void setup_default_idt_entries(struct lguest_ro_state *state,
+			       const unsigned long *def);
+void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+		const unsigned long *def);
+void guest_set_clockevent(struct lguest *lg, unsigned long delta);
+void init_clockdev(struct lguest *lg);
+
+/* segments.c: */
+void setup_default_gdt_entries(struct lguest_ro_state *state);
+void setup_guest_gdt(struct lguest *lg);
+void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num);
+void guest_load_tls(struct lguest *lg, unsigned long tls_array);
+void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
+void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
+
+/* page_tables.c: */
+int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
+void free_guest_pagetable(struct lguest *lg);
+void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
+void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i);
+void guest_pagetable_clear_all(struct lguest *lg);
+void guest_pagetable_flush_user(struct lguest *lg);
+void guest_set_pte(struct lguest *lg, unsigned long cr3,
+		   unsigned long vaddr, gpte_t val);
+void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
+int demand_page(struct lguest *info, unsigned long cr2, int errcode);
+void pin_page(struct lguest *lg, unsigned long vaddr);
+
+/* lguest_user.c: */
+int lguest_device_init(void);
+void lguest_device_remove(void);
+
+/* io.c: */
+void lguest_io_init(void);
+int bind_dma(struct lguest *lg,
+	     unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt);
+void send_dma(struct lguest *info, unsigned long key, unsigned long udma);
+void release_all_dma(struct lguest *lg);
+unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
+			     unsigned long *interrupt);
+
+/* hypercalls.c: */
+void do_hypercalls(struct lguest *lg);
+
+#define kill_guest(lg, fmt...)					\
+do {								\
+	if (!(lg)->dead) {					\
+		(lg)->dead = kasprintf(GFP_ATOMIC, fmt);	\
+		if (!(lg)->dead)				\
+			(lg)->dead = ERR_PTR(-ENOMEM);		\
+	}							\
+} while(0)
+
+static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+	return vaddr - lg->page_offset;
+}
+#endif	/* __ASSEMBLY__ */
+#endif	/* _LGUEST_H */
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
index b3a72bd8d6f5..b9a58b78c990 100644
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -25,6 +25,8 @@
 #include <linux/screen_info.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
 #include <linux/lguest.h>
 #include <linux/lguest_launcher.h>
 #include <linux/lguest_bus.h>
@@ -37,6 +39,7 @@
 #include <asm/e820.h>
 #include <asm/mce.h>
 #include <asm/io.h>
+//#include <asm/sched-clock.h>
 
 /* Declarations for definitions in lguest_guest.S */
 extern char lguest_noirq_start[], lguest_noirq_end[];
@@ -54,7 +57,6 @@ struct lguest_data lguest_data = {
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
 };
 struct lguest_device_desc *lguest_devices;
-static __initdata const struct lguest_boot_info *boot = __va(0);
 
 static enum paravirt_lazy_mode lazy_mode;
 static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
@@ -210,7 +212,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
 	case 1:	/* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*ecx &= 0x00002201;
-		/* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
 		*edx &= 0x07808101;
 		/* Host wants to know when we flush kernel pages: set PGE. */
 		*edx |= 0x00002000;
@@ -346,24 +348,104 @@ static unsigned long lguest_get_wallclock(void)
 	return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
 }
 
+static cycle_t lguest_clock_read(void)
+{
+	if (lguest_data.tsc_khz)
+		return native_read_tsc();
+	else
+		return jiffies;
+}
+
+/* This is what we tell the kernel is our clocksource.  */
+static struct clocksource lguest_clock = {
+	.name		= "lguest",
+	.rating		= 400,
+	.read		= lguest_clock_read,
+};
+
+/* We also need a "struct clock_event_device": Linux asks us to set it to go
+ * off some time in the future.  Actually, James Morris figured all this out, I
+ * just applied the patch. */
+static int lguest_clockevent_set_next_event(unsigned long delta,
+                                           struct clock_event_device *evt)
+{
+	if (delta < LG_CLOCK_MIN_DELTA) {
+		if (printk_ratelimit())
+			printk(KERN_DEBUG "%s: small delta %lu ns\n",
+			       __FUNCTION__, delta);
+		return -ETIME;
+	}
+	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
+	return 0;
+}
+
+static void lguest_clockevent_set_mode(enum clock_event_mode mode,
+                                      struct clock_event_device *evt)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		/* A 0 argument shuts the clock down. */
+		hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
+		break;
+	case CLOCK_EVT_MODE_ONESHOT:
+		/* This is what we expect. */
+		break;
+	case CLOCK_EVT_MODE_PERIODIC:
+		BUG();
+	}
+}
+
+/* This describes our primitive timer chip. */
+static struct clock_event_device lguest_clockevent = {
+	.name                   = "lguest",
+	.features               = CLOCK_EVT_FEAT_ONESHOT,
+	.set_next_event         = lguest_clockevent_set_next_event,
+	.set_mode               = lguest_clockevent_set_mode,
+	.rating                 = INT_MAX,
+	.mult                   = 1,
+	.shift                  = 0,
+	.min_delta_ns           = LG_CLOCK_MIN_DELTA,
+	.max_delta_ns           = LG_CLOCK_MAX_DELTA,
+};
+
+/* This is the Guest timer interrupt handler (hardware interrupt 0).  We just
+ * call the clockevent infrastructure and it does whatever needs doing. */
 static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
 {
-	do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
-	update_process_times(user_mode_vm(get_irq_regs()));
+	unsigned long flags;
+
+	/* Don't interrupt us while this is running. */
+	local_irq_save(flags);
+	lguest_clockevent.event_handler(&lguest_clockevent);
+	local_irq_restore(flags);
 }
 
-static u64 sched_clock_base;
 static void lguest_time_init(void)
 {
 	set_irq_handler(0, lguest_time_irq);
-	hcall(LHCALL_TIMER_READ, 0, 0, 0);
-	sched_clock_base = jiffies_64;
-	enable_lguest_irq(0);
-}
 
-static unsigned long long lguest_sched_clock(void)
-{
-	return (jiffies_64 - sched_clock_base) * (1000000000 / HZ);
+	/* We use the TSC if the Host tells us we can, otherwise a dumb
+	 * jiffies-based clock. */
+	if (lguest_data.tsc_khz) {
+		lguest_clock.shift = 22;
+		lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
+							 lguest_clock.shift);
+		lguest_clock.mask = CLOCKSOURCE_MASK(64);
+		lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+	} else {
+		/* To understand this, start at kernel/time/jiffies.c... */
+		lguest_clock.shift = 8;
+		lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8;
+		lguest_clock.mask = CLOCKSOURCE_MASK(32);
+	}
+	clocksource_register(&lguest_clock);
+
+	/* We can't set cpumask in the initializer: damn C limitations! */
+	lguest_clockevent.cpumask = cpumask_of_cpu(0);
+	clockevents_register_device(&lguest_clockevent);
+
+	enable_lguest_irq(0);
 }
 
 static void lguest_load_esp0(struct tss_struct *tss,
@@ -418,8 +500,7 @@ static __init char *lguest_memory_setup(void)
 	/* We do this here because lockcheck barfs if before start_kernel */
 	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
-	e820.nr_map = 0;
-	add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+	add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
 	return "LGUEST";
 }
 
@@ -450,8 +531,13 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
 	return insn_len;
 }
 
-__init void lguest_init(void)
+__init void lguest_init(void *boot)
 {
+	/* Copy boot parameters first. */
+	memcpy(&boot_params, boot, PARAM_SIZE);
+	memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
+	       COMMAND_LINE_SIZE);
+
 	paravirt_ops.name = "lguest";
 	paravirt_ops.paravirt_enabled = 1;
 	paravirt_ops.kernel_rpl = 1;
@@ -498,10 +584,8 @@ __init void lguest_init(void)
 	paravirt_ops.time_init = lguest_time_init;
 	paravirt_ops.set_lazy_mode = lguest_lazy_mode;
 	paravirt_ops.wbinvd = lguest_wbinvd;
-	paravirt_ops.sched_clock = lguest_sched_clock;
 
 	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
-	strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);
 
 	/* We use top of mem for initial pagetables. */
 	init_pg_tables_end = __pa(pg0);
@@ -532,13 +616,6 @@ __init void lguest_init(void)
 
 	add_preferred_console("hvc", 0, NULL);
 
-	if (boot->initrd_size) {
-		/* We stash this at top of memory. */
-		INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
-		INITRD_SIZE = boot->initrd_size;
-		LOADER_TYPE = 0xFF;
-	}
-
 	pm_power_off = lguest_power_off;
 	start_kernel();
 }
diff --git a/drivers/lguest/lguest_asm.S b/drivers/lguest/lguest_asm.S
index 5ac3d20bb184..00046c57b5ba 100644
--- a/drivers/lguest/lguest_asm.S
+++ b/drivers/lguest/lguest_asm.S
@@ -10,7 +10,8 @@
  * This is where we begin: we have a magic signature which the launcher looks
  * for.  The plan is that the Linux boot protocol will be extended with a
  * "platform type" field which will guide us here from the normal entry point,
- * but for the moment this suffices.
+ * but for the moment this suffices.  We pass the virtual address of the boot
+ * info to lguest_init().
  *
  * We put it in .init.text will be discarded after boot.
  */
@@ -18,6 +19,8 @@
 .ascii "GenuineLguest"
 	/* Set up initial stack. */
  	movl $(init_thread_union+THREAD_SIZE),%esp
+	movl %esi, %eax
+	addl $__PAGE_OFFSET, %eax
 	jmp lguest_init
 
 /* The templates for inline patching. */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
new file mode 100644
index 000000000000..e90d7a783daf
--- /dev/null
+++ b/drivers/lguest/lguest_user.c
@@ -0,0 +1,236 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include "lg.h"
+
+static void setup_regs(struct lguest_regs *regs, unsigned long start)
+{
+	/* Write out stack in format lguest expects, so we can switch to it. */
+	regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
+	regs->cs = __KERNEL_CS|GUEST_PL;
+	regs->eflags = 0x202; 	/* Interrupts enabled. */
+	regs->eip = start;
+	/* esi points to our boot information (physical address 0) */
+}
+
+/* + addr */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+	unsigned long key, udma, irq;
+
+	if (get_user(key, input) != 0)
+		return -EFAULT;
+	udma = get_dma_buffer(lg, key, &irq);
+	if (!udma)
+		return -ENOENT;
+
+	/* We put irq number in udma->used_len. */
+	lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+	return udma;
+}
+
+/* To force the Guest to stop running and return to the Launcher, the
+ * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest.  The
+ * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */
+static int break_guest_out(struct lguest *lg, const u32 __user *input)
+{
+	unsigned long on;
+
+	/* Fetch whether they're turning break on or off.. */
+	if (get_user(on, input) != 0)
+		return -EFAULT;
+
+	if (on) {
+		lg->break_out = 1;
+		/* Pop it out (may be running on different CPU) */
+		wake_up_process(lg->tsk);
+		/* Wait for them to reset it */
+		return wait_event_interruptible(lg->break_wq, !lg->break_out);
+	} else {
+		lg->break_out = 0;
+		wake_up(&lg->break_wq);
+		return 0;
+	}
+}
+
+/* + irq */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+	u32 irq;
+
+	if (get_user(irq, input) != 0)
+		return -EFAULT;
+	if (irq >= LGUEST_IRQS)
+		return -EINVAL;
+	set_bit(irq, lg->irqs_pending);
+	return 0;
+}
+
+static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
+{
+	struct lguest *lg = file->private_data;
+
+	if (!lg)
+		return -EINVAL;
+
+	/* If you're not the task which owns the guest, go away. */
+	if (current != lg->tsk)
+		return -EPERM;
+
+	if (lg->dead) {
+		size_t len;
+
+		if (IS_ERR(lg->dead))
+			return PTR_ERR(lg->dead);
+
+		len = min(size, strlen(lg->dead)+1);
+		if (copy_to_user(user, lg->dead, len) != 0)
+			return -EFAULT;
+		return len;
+	}
+
+	if (lg->dma_is_pending)
+		lg->dma_is_pending = 0;
+
+	return run_guest(lg, (unsigned long __user *)user);
+}
+
+/* Take: pfnlimit, pgdir, start, pageoffset. */
+static int initialize(struct file *file, const u32 __user *input)
+{
+	struct lguest *lg;
+	int err, i;
+	u32 args[4];
+
+	/* We grab the Big Lguest lock, which protects the global array
+	 * "lguests" and multiple simultaneous initializations. */
+	mutex_lock(&lguest_lock);
+
+	if (file->private_data) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	if (copy_from_user(args, input, sizeof(args)) != 0) {
+		err = -EFAULT;
+		goto unlock;
+	}
+
+	i = find_free_guest();
+	if (i < 0) {
+		err = -ENOSPC;
+		goto unlock;
+	}
+	lg = &lguests[i];
+	lg->guestid = i;
+	lg->pfn_limit = args[0];
+	lg->page_offset = args[3];
+	lg->regs_page = get_zeroed_page(GFP_KERNEL);
+	if (!lg->regs_page) {
+		err = -ENOMEM;
+		goto release_guest;
+	}
+	lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
+
+	err = init_guest_pagetable(lg, args[1]);
+	if (err)
+		goto free_regs;
+
+	setup_regs(lg->regs, args[2]);
+	setup_guest_gdt(lg);
+	init_clockdev(lg);
+	lg->tsk = current;
+	lg->mm = get_task_mm(lg->tsk);
+	init_waitqueue_head(&lg->break_wq);
+	lg->last_pages = NULL;
+	file->private_data = lg;
+
+	mutex_unlock(&lguest_lock);
+
+	return sizeof(args);
+
+free_regs:
+	free_page(lg->regs_page);
+release_guest:
+	memset(lg, 0, sizeof(*lg));
+unlock:
+	mutex_unlock(&lguest_lock);
+	return err;
+}
+
+static ssize_t write(struct file *file, const char __user *input,
+		     size_t size, loff_t *off)
+{
+	struct lguest *lg = file->private_data;
+	u32 req;
+
+	if (get_user(req, input) != 0)
+		return -EFAULT;
+	input += sizeof(req);
+
+	if (req != LHREQ_INITIALIZE && !lg)
+		return -EINVAL;
+	if (lg && lg->dead)
+		return -ENOENT;
+
+	/* If you're not the task which owns the Guest, you can only break */
+	if (lg && current != lg->tsk && req != LHREQ_BREAK)
+		return -EPERM;
+
+	switch (req) {
+	case LHREQ_INITIALIZE:
+		return initialize(file, (const u32 __user *)input);
+	case LHREQ_GETDMA:
+		return user_get_dma(lg, (const u32 __user *)input);
+	case LHREQ_IRQ:
+		return user_send_irq(lg, (const u32 __user *)input);
+	case LHREQ_BREAK:
+		return break_guest_out(lg, (const u32 __user *)input);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int close(struct inode *inode, struct file *file)
+{
+	struct lguest *lg = file->private_data;
+
+	if (!lg)
+		return 0;
+
+	mutex_lock(&lguest_lock);
+	/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
+	hrtimer_cancel(&lg->hrt);
+	release_all_dma(lg);
+	free_guest_pagetable(lg);
+	mmput(lg->mm);
+	if (!IS_ERR(lg->dead))
+		kfree(lg->dead);
+	free_page(lg->regs_page);
+	memset(lg, 0, sizeof(*lg));
+	mutex_unlock(&lguest_lock);
+	return 0;
+}
+
+static struct file_operations lguest_fops = {
+	.owner	 = THIS_MODULE,
+	.release = close,
+	.write	 = write,
+	.read	 = read,
+};
+static struct miscdevice lguest_dev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "lguest",
+	.fops	= &lguest_fops,
+};
+
+int __init lguest_device_init(void)
+{
+	return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+	misc_deregister(&lguest_dev);
+}
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
new file mode 100644
index 000000000000..1b0ba09b1269
--- /dev/null
+++ b/drivers/lguest/page_tables.c
@@ -0,0 +1,411 @@
+/* Shadow page table operations.
+ * Copyright (C) Rusty Russell IBM Corporation 2006.
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include "lg.h"
+
+#define PTES_PER_PAGE_SHIFT 10
+#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
+#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
+
+static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
+#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
+
+static unsigned vaddr_to_pgd_index(unsigned long vaddr)
+{
+	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+}
+
+/* These access the shadow versions (ie. the ones used by the CPU). */
+static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
+{
+	unsigned int index = vaddr_to_pgd_index(vaddr);
+
+	if (index >= SWITCHER_PGD_INDEX) {
+		kill_guest(lg, "attempt to access switcher pages");
+		index = 0;
+	}
+	return &lg->pgdirs[i].pgdir[index];
+}
+
+static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
+{
+	spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
+	BUG_ON(!(spgd.flags & _PAGE_PRESENT));
+	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+}
+
+/* These access the guest versions. */
+static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
+{
+	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
+}
+
+static unsigned long gpte_addr(struct lguest *lg,
+			       gpgd_t gpgd, unsigned long vaddr)
+{
+	unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
+	BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
+	return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
+}
+
+/* Do a virtual -> physical mapping on a user page. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+	struct page *page;
+	unsigned long ret = -1UL;
+
+	down_read(&current->mm->mmap_sem);
+	if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+			   1, write, 1, &page, NULL) == 1)
+		ret = page_to_pfn(page);
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
+{
+	spte_t spte;
+	unsigned long pfn;
+
+	/* We ignore the global flag. */
+	spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
+	pfn = get_pfn(gpte.pfn, write);
+	if (pfn == -1UL) {
+		kill_guest(lg, "failed to get page %u", gpte.pfn);
+		/* Must not put_page() bogus page on cleanup. */
+		spte.flags = 0;
+	}
+	spte.pfn = pfn;
+	return spte;
+}
+
+static void release_pte(spte_t pte)
+{
+	if (pte.flags & _PAGE_PRESENT)
+		put_page(pfn_to_page(pte.pfn));
+}
+
+static void check_gpte(struct lguest *lg, gpte_t gpte)
+{
+	if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
+		kill_guest(lg, "bad page table entry");
+}
+
+static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
+{
+	if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
+		kill_guest(lg, "bad page directory entry");
+}
+
+/* FIXME: We hold reference to pages, which prevents them from being
+   swapped.  It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Return true if we got page. */
+int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
+{
+	gpgd_t gpgd;
+	spgd_t *spgd;
+	unsigned long gpte_ptr;
+	gpte_t gpte;
+	spte_t *spte;
+
+	gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
+	if (!(gpgd.flags & _PAGE_PRESENT))
+		return 0;
+
+	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+	if (!(spgd->flags & _PAGE_PRESENT)) {
+		/* Get a page of PTEs for them. */
+		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+		/* FIXME: Steal from self in this case? */
+		if (!ptepage) {
+			kill_guest(lg, "out of memory allocating pte page");
+			return 0;
+		}
+		check_gpgd(lg, gpgd);
+		spgd->raw.val = (__pa(ptepage) | gpgd.flags);
+	}
+
+	gpte_ptr = gpte_addr(lg, gpgd, vaddr);
+	gpte = mkgpte(lgread_u32(lg, gpte_ptr));
+
+	/* No page? */
+	if (!(gpte.flags & _PAGE_PRESENT))
+		return 0;
+
+	/* Write to read-only page? */
+	if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
+		return 0;
+
+	/* User access to a non-user page? */
+	if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
+		return 0;
+
+	check_gpte(lg, gpte);
+	gpte.flags |= _PAGE_ACCESSED;
+	if (errcode & 2)
+		gpte.flags |= _PAGE_DIRTY;
+
+	/* We're done with the old pte. */
+	spte = spte_addr(lg, *spgd, vaddr);
+	release_pte(*spte);
+
+	/* We don't make it writable if this isn't a write: later
+	 * write will fault so we can set dirty bit in guest. */
+	if (gpte.flags & _PAGE_DIRTY)
+		*spte = gpte_to_spte(lg, gpte, 1);
+	else {
+		gpte_t ro_gpte = gpte;
+		ro_gpte.flags &= ~_PAGE_RW;
+		*spte = gpte_to_spte(lg, ro_gpte, 0);
+	}
+
+	/* Now we update dirty/accessed on guest. */
+	lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
+	return 1;
+}
+
+/* This is much faster than the full demand_page logic. */
+static int page_writable(struct lguest *lg, unsigned long vaddr)
+{
+	spgd_t *spgd;
+	unsigned long flags;
+
+	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+	if (!(spgd->flags & _PAGE_PRESENT))
+		return 0;
+
+	flags = spte_addr(lg, *spgd, vaddr)->flags;
+	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
+}
+
+void pin_page(struct lguest *lg, unsigned long vaddr)
+{
+	if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
+		kill_guest(lg, "bad stack page %#lx", vaddr);
+}
+
+static void release_pgd(struct lguest *lg, spgd_t *spgd)
+{
+	if (spgd->flags & _PAGE_PRESENT) {
+		unsigned int i;
+		spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
+		for (i = 0; i < PTES_PER_PAGE; i++)
+			release_pte(ptepage[i]);
+		free_page((long)ptepage);
+		spgd->raw.val = 0;
+	}
+}
+
+static void flush_user_mappings(struct lguest *lg, int idx)
+{
+	unsigned int i;
+	for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
+		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+}
+
+void guest_pagetable_flush_user(struct lguest *lg)
+{
+	flush_user_mappings(lg, lg->pgdidx);
+}
+
+static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
+{
+	unsigned int i;
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].cr3 == pgtable)
+			break;
+	return i;
+}
+
+static unsigned int new_pgdir(struct lguest *lg,
+			      unsigned long cr3,
+			      int *blank_pgdir)
+{
+	unsigned int next;
+
+	next = random32() % ARRAY_SIZE(lg->pgdirs);
+	if (!lg->pgdirs[next].pgdir) {
+		lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
+		if (!lg->pgdirs[next].pgdir)
+			next = lg->pgdidx;
+		else
+			/* There are no mappings: you'll need to re-pin */
+			*blank_pgdir = 1;
+	}
+	lg->pgdirs[next].cr3 = cr3;
+	/* Release all the non-kernel mappings. */
+	flush_user_mappings(lg, next);
+
+	return next;
+}
+
+void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
+{
+	int newpgdir, repin = 0;
+
+	newpgdir = find_pgdir(lg, pgtable);
+	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
+		newpgdir = new_pgdir(lg, pgtable, &repin);
+	lg->pgdidx = newpgdir;
+	if (repin)
+		pin_stack_pages(lg);
+}
+
+static void release_all_pagetables(struct lguest *lg)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].pgdir)
+			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
+				release_pgd(lg, lg->pgdirs[i].pgdir + j);
+}
+
+void guest_pagetable_clear_all(struct lguest *lg)
+{
+	release_all_pagetables(lg);
+	pin_stack_pages(lg);
+}
+
+static void do_set_pte(struct lguest *lg, int idx,
+		       unsigned long vaddr, gpte_t gpte)
+{
+	spgd_t *spgd = spgd_addr(lg, idx, vaddr);
+	if (spgd->flags & _PAGE_PRESENT) {
+		spte_t *spte = spte_addr(lg, *spgd, vaddr);
+		release_pte(*spte);
+		if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+			check_gpte(lg, gpte);
+			*spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
+		} else
+			spte->raw.val = 0;
+	}
+}
+
+void guest_set_pte(struct lguest *lg,
+		   unsigned long cr3, unsigned long vaddr, gpte_t gpte)
+{
+	/* Kernel mappings must be changed on all top levels. */
+	if (vaddr >= lg->page_offset) {
+		unsigned int i;
+		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+			if (lg->pgdirs[i].pgdir)
+				do_set_pte(lg, i, vaddr, gpte);
+	} else {
+		int pgdir = find_pgdir(lg, cr3);
+		if (pgdir != ARRAY_SIZE(lg->pgdirs))
+			do_set_pte(lg, pgdir, vaddr, gpte);
+	}
+}
+
+void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
+{
+	int pgdir;
+
+	if (idx >= SWITCHER_PGD_INDEX)
+		return;
+
+	pgdir = find_pgdir(lg, cr3);
+	if (pgdir < ARRAY_SIZE(lg->pgdirs))
+		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+}
+
+int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
+{
+	/* We assume this in flush_user_mappings, so check now */
+	if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
+		return -EINVAL;
+	lg->pgdidx = 0;
+	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
+	lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
+	if (!lg->pgdirs[lg->pgdidx].pgdir)
+		return -ENOMEM;
+	return 0;
+}
+
+void free_guest_pagetable(struct lguest *lg)
+{
+	unsigned int i;
+
+	release_all_pagetables(lg);
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		free_page((long)lg->pgdirs[i].pgdir);
+}
+
+/* Caller must be preempt-safe */
+void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
+{
+	spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
+	spgd_t switcher_pgd;
+	spte_t regs_pte;
+
+	/* Since switcher less that 4MB, we simply mug top pte page. */
+	switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
+	switcher_pgd.flags = _PAGE_KERNEL;
+	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+
+	/* Map our regs page over stack page. */
+	regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
+	regs_pte.flags = _PAGE_KERNEL;
+	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
+		= regs_pte;
+}
+
+static void free_switcher_pte_pages(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		free_page((long)switcher_pte_page(i));
+}
+
+static __init void populate_switcher_pte_page(unsigned int cpu,
+					      struct page *switcher_page[],
+					      unsigned int pages)
+{
+	unsigned int i;
+	spte_t *pte = switcher_pte_page(cpu);
+
+	for (i = 0; i < pages; i++) {
+		pte[i].pfn = page_to_pfn(switcher_page[i]);
+		pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+	}
+
+	/* We only map this CPU's pages, so guest can't see others. */
+	i = pages + cpu*2;
+
+	/* First page (regs) is rw, second (state) is ro. */
+	pte[i].pfn = page_to_pfn(switcher_page[i]);
+	pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
+	pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
+	pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
+}
+
+__init int init_pagetables(struct page **switcher_page, unsigned int pages)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i) {
+		switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
+		if (!switcher_pte_page(i)) {
+			free_switcher_pte_pages();
+			return -ENOMEM;
+		}
+		populate_switcher_pte_page(i, switcher_page, pages);
+	}
+	return 0;
+}
+
+void free_pagetables(void)
+{
+	free_switcher_pte_pages();
+}
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
new file mode 100644
index 000000000000..1b2cfe89dcd5
--- /dev/null
+++ b/drivers/lguest/segments.c
@@ -0,0 +1,125 @@
+#include "lg.h"
+
+static int desc_ok(const struct desc_struct *gdt)
+{
+	/* MBZ=0, P=1, DT=1  */
+	return ((gdt->b & 0x00209000) == 0x00009000);
+}
+
+static int segment_present(const struct desc_struct *gdt)
+{
+	return gdt->b & 0x8000;
+}
+
+static int ignored_gdt(unsigned int num)
+{
+	return (num == GDT_ENTRY_TSS
+		|| num == GDT_ENTRY_LGUEST_CS
+		|| num == GDT_ENTRY_LGUEST_DS
+		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
+}
+
+/* We don't allow removal of CS, DS or SS; it doesn't make sense. */
+static void check_segment_use(struct lguest *lg, unsigned int desc)
+{
+	if (lg->regs->gs / 8 == desc)
+		lg->regs->gs = 0;
+	if (lg->regs->fs / 8 == desc)
+		lg->regs->fs = 0;
+	if (lg->regs->es / 8 == desc)
+		lg->regs->es = 0;
+	if (lg->regs->ds / 8 == desc
+	    || lg->regs->cs / 8 == desc
+	    || lg->regs->ss / 8 == desc)
+		kill_guest(lg, "Removed live GDT entry %u", desc);
+}
+
+static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
+{
+	unsigned int i;
+
+	for (i = start; i < end; i++) {
+		/* We never copy these ones to real gdt */
+		if (ignored_gdt(i))
+			continue;
+
+		/* We could fault in switch_to_guest if they are using
+		 * a removed segment. */
+		if (!segment_present(&lg->gdt[i])) {
+			check_segment_use(lg, i);
+			continue;
+		}
+
+		if (!desc_ok(&lg->gdt[i]))
+			kill_guest(lg, "Bad GDT descriptor %i", i);
+
+		/* DPL 0 presumably means "for use by guest". */
+		if ((lg->gdt[i].b & 0x00006000) == 0)
+			lg->gdt[i].b |= (GUEST_PL << 13);
+
+		/* Set accessed bit, since gdt isn't writable. */
+		lg->gdt[i].b |= 0x00000100;
+	}
+}
+
+void setup_default_gdt_entries(struct lguest_ro_state *state)
+{
+	struct desc_struct *gdt = state->guest_gdt;
+	unsigned long tss = (unsigned long)&state->guest_tss;
+
+	/* Hypervisor segments. */
+	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+
+	/* This is the one which we *cannot* copy from guest, since tss
+	   is depended on this lguest_ro_state, ie. this cpu. */
+	gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
+	gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
+		| ((tss >> 16) & 0x000000FF);
+}
+
+void setup_guest_gdt(struct lguest *lg)
+{
+	lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+	lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+	lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
+	lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
+}
+
+/* This is a fast version for the common case where only the three TLS entries
+ * have changed. */
+void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
+{
+	unsigned int i;
+
+	for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
+		gdt[i] = lg->gdt[i];
+}
+
+void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
+{
+	unsigned int i;
+
+	for (i = 0; i < GDT_ENTRIES; i++)
+		if (!ignored_gdt(i))
+			gdt[i] = lg->gdt[i];
+}
+
+void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
+{
+	if (num > ARRAY_SIZE(lg->gdt))
+		kill_guest(lg, "too many gdt entries %i", num);
+
+	lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
+	fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
+	lg->changed |= CHANGED_GDT;
+}
+
+void guest_load_tls(struct lguest *lg, unsigned long gtls)
+{
+	struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN];
+
+	lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+	fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
+	lg->changed |= CHANGED_GDT_TLS;
+}
diff --git a/drivers/lguest/switcher.S b/drivers/lguest/switcher.S
new file mode 100644
index 000000000000..eadd4cc299d2
--- /dev/null
+++ b/drivers/lguest/switcher.S
@@ -0,0 +1,159 @@
+/* This code sits at 0xFFC00000 to do the low-level guest<->host switch.
+
+   There is are two pages above us for this CPU (struct lguest_pages).
+   The second page (struct lguest_ro_state) becomes read-only after the
+   context switch.  The first page (the stack for traps) remains writable,
+   but while we're in here, the guest cannot be running.
+*/
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+.text
+ENTRY(start_switcher_text)
+
+/* %eax points to lguest pages for this CPU.  %ebx contains cr3 value.
+   All normal registers can be clobbered! */
+ENTRY(switch_to_guest)
+	/* Save host segments on host stack. */
+	pushl	%es
+	pushl	%ds
+	pushl	%gs
+	pushl	%fs
+	/* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */
+	pushl	%ebp
+	/* Save host stack. */
+	movl	%esp, LGUEST_PAGES_host_sp(%eax)
+	/* Switch to guest stack: if we get NMI we expect to be there. */
+	movl	%eax, %edx
+	addl	$LGUEST_PAGES_regs, %edx
+	movl	%edx, %esp
+	/* Switch to guest's GDT, IDT. */
+	lgdt	LGUEST_PAGES_guest_gdt_desc(%eax)
+	lidt	LGUEST_PAGES_guest_idt_desc(%eax)
+	/* Switch to guest's TSS while GDT still writable. */
+	movl	$(GDT_ENTRY_TSS*8), %edx
+	ltr	%dx
+	/* Set host's TSS GDT entry to available (clear byte 5 bit 2). */
+	movl	(LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
+	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
+	/* Switch to guest page tables:	lguest_pages->state now read-only. */
+	movl	%ebx, %cr3
+	/* Restore guest regs */
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%gs
+	popl	%eax
+	popl	%fs
+	popl	%ds
+	popl	%es
+	/* Skip error code and trap number */
+	addl	$8, %esp
+	iret
+
+#define SWITCH_TO_HOST							\
+	/* Save guest state */						\
+	pushl	%es;							\
+	pushl	%ds;							\
+	pushl	%fs;							\
+	pushl	%eax;							\
+	pushl	%gs;							\
+	pushl	%ebp;							\
+	pushl	%edi;							\
+	pushl	%esi;							\
+	pushl	%edx;							\
+	pushl	%ecx;							\
+	pushl	%ebx;							\
+	/* Load lguest ds segment for convenience. */			\
+	movl	$(LGUEST_DS), %eax;					\
+	movl	%eax, %ds;						\
+	/* Figure out where we are, based on stack (at top of regs). */	\
+	movl	%esp, %eax;						\
+	subl	$LGUEST_PAGES_regs, %eax;				\
+	/* Put trap number in %ebx before we switch cr3 and lose it. */ \
+	movl	LGUEST_PAGES_regs_trapnum(%eax), %ebx;			\
+	/* Switch to host page tables (host GDT, IDT and stack are in host   \
+	   mem, so need this first) */					\
+	movl	LGUEST_PAGES_host_cr3(%eax), %edx;			\
+	movl	%edx, %cr3;						\
+	/* Set guest's TSS to available (clear byte 5 bit 2). */	\
+	andb	$0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
+	/* Switch to host's GDT & IDT. */				\
+	lgdt	LGUEST_PAGES_host_gdt_desc(%eax);			\
+	lidt	LGUEST_PAGES_host_idt_desc(%eax);			\
+	/* Switch to host's stack. */					\
+	movl	LGUEST_PAGES_host_sp(%eax), %esp;			\
+	/* Switch to host's TSS */					\
+	movl	$(GDT_ENTRY_TSS*8), %edx;				\
+	ltr	%dx;							\
+	popl	%ebp;							\
+	popl	%fs;							\
+	popl	%gs;							\
+	popl	%ds;							\
+	popl	%es
+
+/* Return to run_guest_once. */
+return_to_host:
+	SWITCH_TO_HOST
+	iret
+
+deliver_to_host:
+	SWITCH_TO_HOST
+	/* Decode IDT and jump to hosts' irq handler.  When that does iret, it
+	 * will return to run_guest_once.  This is a feature. */
+	movl	(LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
+	leal	(%edx,%ebx,8), %eax
+	movzwl	(%eax),%edx
+	movl	4(%eax), %eax
+	xorw	%ax, %ax
+	orl	%eax, %edx
+	jmp	*%edx
+
+/* Real hardware interrupts are delivered straight to the host.  Others
+   cause us to return to run_guest_once so it can decide what to do.  Note
+   that some of these are overridden by the guest to deliver directly, and
+   never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+	.data; .long 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+ .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+	pushl	$0
+ .endif
+	pushl	$\N
+	jmp	\TARGET
+	ALIGN
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+	IRQ_STUB irq \TARGET
+  irq=irq+1
+ .endr
+.endm
+
+/* We intercept every interrupt, because we may need to switch back to
+ * host.  Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+.data
+.global default_idt_entries
+default_idt_entries:
+.text
+	IRQ_STUBS 0 1 return_to_host		/* First two traps */
+	IRQ_STUB 2 handle_nmi			/* NMI */
+	IRQ_STUBS 3 31 return_to_host		/* Rest of traps */
+	IRQ_STUBS 32 127 deliver_to_host	/* Real interrupts */
+	IRQ_STUB 128 return_to_host		/* System call (overridden) */
+	IRQ_STUBS 129 255 deliver_to_host	/* Other real interrupts */
+
+/* We ignore NMI and return. */
+handle_nmi:
+	addl	$8, %esp
+	iret
+
+ENTRY(end_switcher_text)
diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
index 62c091ffcccc..a4d806610b7f 100644
--- a/include/asm-i386/tsc.h
+++ b/include/asm-i386/tsc.h
@@ -63,6 +63,7 @@ extern void tsc_init(void);
 extern void mark_tsc_unstable(char *reason);
 extern int unsynchronized_tsc(void);
 extern void init_tsc_clocksource(void);
+int check_tsc_unstable(void);
 
 /*
  * Boot-time check whether the TSCs are synchronized across
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index f30c04fc22b7..500aace21ca7 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -3,11 +3,6 @@
 #ifndef _ASM_LGUEST_H
 #define _ASM_LGUEST_H
 
-/* These are randomly chosen numbers which indicate we're an lguest at boot */
-#define LGUEST_MAGIC_EBP 0x4C687970
-#define LGUEST_MAGIC_EDI 0x652D4D65
-#define LGUEST_MAGIC_ESI 0xFFFFFFFF
-
 #ifndef __ASSEMBLY__
 #include <asm/irq.h>
 
@@ -20,7 +15,7 @@
 #define LHCALL_LOAD_IDT_ENTRY	6
 #define LHCALL_SET_STACK	7
 #define LHCALL_TS		8
-#define LHCALL_TIMER_READ	9
+#define LHCALL_SET_CLOCKEVENT	9
 #define LHCALL_HALT		10
 #define LHCALL_GET_WALLCLOCK	11
 #define LHCALL_BIND_DMA		12
@@ -29,6 +24,9 @@
 #define LHCALL_SET_PMD		15
 #define LHCALL_LOAD_TLS		16
 
+#define LG_CLOCK_MIN_DELTA	100UL
+#define LG_CLOCK_MAX_DELTA	ULONG_MAX
+
 #define LGUEST_TRAP_ENTRY 0x1F
 
 static inline unsigned long
@@ -75,6 +73,8 @@ struct lguest_data
 	unsigned long reserve_mem;
 	/* ID of this guest (used by network driver to set ethernet address) */
 	u16 guestid;
+	/* KHz for the TSC clock. */
+	u32 tsc_khz;
 
 /* Fields initialized by the guest at boot: */
 	/* Instruction range to suppress interrupts even if enabled */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
new file mode 100644
index 000000000000..0ba414a40c80
--- /dev/null
+++ b/include/linux/lguest_launcher.h
@@ -0,0 +1,73 @@
+#ifndef _ASM_LGUEST_USER
+#define _ASM_LGUEST_USER
+/* Everything the "lguest" userspace program needs to know. */
+/* They can register up to 32 arrays of lguest_dma. */
+#define LGUEST_MAX_DMA		32
+/* At most we can dma 16 lguest_dma in one op. */
+#define LGUEST_MAX_DMA_SECTIONS	16
+
+/* How many devices?  Assume each one wants up to two dma arrays per device. */
+#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
+
+struct lguest_dma
+{
+	/* 0 if free to be used, filled by hypervisor. */
+ 	u32 used_len;
+	unsigned long addr[LGUEST_MAX_DMA_SECTIONS];
+	u16 len[LGUEST_MAX_DMA_SECTIONS];
+};
+
+struct lguest_block_page
+{
+	/* 0 is a read, 1 is a write. */
+	int type;
+	u32 sector; 	/* Offset in device = sector * 512. */
+	u32 bytes;	/* Length expected to be read/written in bytes */
+	/* 0 = pending, 1 = done, 2 = done, error */
+	int result;
+	u32 num_sectors; /* Disk length = num_sectors * 512 */
+};
+
+/* There is a shared page of these. */
+struct lguest_net
+{
+	/* Simply the mac address (with multicast bit meaning promisc). */
+	unsigned char mac[6];
+};
+
+/* Where the Host expects the Guest to SEND_DMA console output to. */
+#define LGUEST_CONSOLE_DMA_KEY 0
+
+/* We have a page of these descriptors in the lguest_device page. */
+struct lguest_device_desc {
+	u16 type;
+#define LGUEST_DEVICE_T_CONSOLE	1
+#define LGUEST_DEVICE_T_NET	2
+#define LGUEST_DEVICE_T_BLOCK	3
+
+	u16 features;
+#define LGUEST_NET_F_NOCSUM		0x4000 /* Don't bother checksumming */
+#define LGUEST_DEVICE_F_RANDOMNESS	0x8000 /* IRQ is fairly random */
+
+	u16 status;
+/* 256 and above are device specific. */
+#define LGUEST_DEVICE_S_ACKNOWLEDGE	1 /* We have seen device. */
+#define LGUEST_DEVICE_S_DRIVER		2 /* We have found a driver */
+#define LGUEST_DEVICE_S_DRIVER_OK	4 /* Driver says OK! */
+#define LGUEST_DEVICE_S_REMOVED		8 /* Device has gone away. */
+#define LGUEST_DEVICE_S_REMOVED_ACK	16 /* Driver has been told. */
+#define LGUEST_DEVICE_S_FAILED		128 /* Something actually failed */
+
+	u16 num_pages;
+	u32 pfn;
+};
+
+/* Write command first word is a request. */
+enum lguest_req
+{
+	LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
+	LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
+	LHREQ_IRQ, /* + irq */
+	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+};
+#endif /* _ASM_LGUEST_USER */
diff --git a/kernel/fork.c b/kernel/fork.c
index e7a2d995b087..469838998220 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -127,7 +127,6 @@ void __put_task_struct(struct task_struct *tsk)
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
 }
-EXPORT_SYMBOL_GPL(__put_task_struct);
 
 void __init fork_init(unsigned long mempages)
 {
-- 
cgit v1.3-8-gc7d7


From c0d121720220584bba2876b032e58a076b843fa1 Mon Sep 17 00:00:00 2001
From: Dave Jiang <djiang@mvista.com>
Date: Thu, 19 Jul 2007 01:49:46 -0700
Subject: drivers/edac: add new nmi rescan

Provides a way for NMI reported errors on x86 to notify the EDAC
subsystem pending ECC errors by writing to a software state variable.

Here's the reworked patch. I added an EDAC stub to the kernel so we can
have variables that are in the kernel even if EDAC is a module. I also
implemented the idea of using the chip driver to select error detection
mode via module parameter and eliminate the kernel compile option.
Please review/test. Thx!

Also, I only made changes to some of the chipset drivers since I am
unfamiliar with the other ones. We can add similar changes as we go.

Signed-off-by: Dave Jiang <djiang@mvista.com>
Signed-off-by: Douglas Thompson <dougthompson@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/kernel/traps.c   | 12 ++++++++++++
 arch/x86_64/kernel/traps.c | 11 +++++++++++
 drivers/edac/Kconfig       | 11 -----------
 drivers/edac/Makefile      |  2 +-
 drivers/edac/e752x_edac.c  | 14 +++++++++++++-
 drivers/edac/e7xxx_edac.c  | 14 ++++++++++++++
 drivers/edac/edac_mc.c     |  3 +++
 drivers/edac/edac_module.c | 24 ++++++++++++++++++++++--
 drivers/edac/edac_stub.c   | 42 ++++++++++++++++++++++++++++++++++++++++++
 drivers/edac/i5000_edac.c  | 13 +++++++++++++
 include/linux/edac.h       | 29 +++++++++++++++++++++++++++++
 11 files changed, 160 insertions(+), 15 deletions(-)
 create mode 100644 drivers/edac/edac_stub.c
 create mode 100644 include/linux/edac.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 109ebbcde585..3e7753c78b9b 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -41,6 +41,10 @@
 #include <linux/mca.h>
 #endif
 
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -638,6 +642,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
 		"CPU %d.\n", reason, smp_processor_id());
 	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+
+#if defined(CONFIG_EDAC)
+	if(edac_handler_set()) {
+		edac_atomic_assert_error();
+		return;
+	}
+#endif
+
 	if (panic_on_unrecovered_nmi)
                 panic("NMI: Not continuing");
 
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 74cbeb2e99a6..8713ad4a4db1 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -34,6 +34,10 @@
 #include <linux/bug.h>
 #include <linux/kdebug.h>
 
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
@@ -719,6 +723,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 		reason);
 	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
 
+#if defined(CONFIG_EDAC)
+	if(edac_handler_set()) {
+		edac_atomic_assert_error();
+		return;
+	}
+#endif
+
 	if (panic_on_unrecovered_nmi)
 		panic("NMI: Not continuing");
 
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index e8c4a2bedaa1..3cfd9065a9b4 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -109,15 +109,4 @@ config EDAC_I5000
 	  Support for error detection and correction the Intel
 	  Greekcreek/Blackford chipsets.
 
-choice
-	prompt "Error detecting method"
-	default EDAC_POLL
-
-config EDAC_POLL
-	bool "Poll for errors"
-	help
-	  Poll the chipset periodically to detect errors.
-
-endchoice
-
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 773472cef76e..19d5ac724098 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -5,9 +5,9 @@
 # This file may be distributed under the terms of the
 # GNU General Public License.
 #
-# $Id: Makefile,v 1.4.2.3 2005/07/08 22:05:38 dsp_llnl Exp $
 
 
+obj-$(CONFIG_EDAC)			:= edac_stub.o
 obj-$(CONFIG_EDAC_MM_EDAC)		+= edac_core.o
 
 edac_core-objs	:= edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o
diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c
index 8bcc887692ab..f51e79a6f891 100644
--- a/drivers/edac/e752x_edac.c
+++ b/drivers/edac/e752x_edac.c
@@ -22,6 +22,7 @@
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
 #include <linux/slab.h>
+#include <linux/edac.h>
 #include "edac_mc.h"
 
 #define E752X_REVISION	" Ver: 2.0.1 " __DATE__
@@ -948,6 +949,16 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx)
 	debugf0("%s(): mci\n", __func__);
 	debugf0("Starting Probe1\n");
 
+	/* make sure error reporting method is sane */
+	switch(edac_op_state) {
+		case EDAC_OPSTATE_POLL:
+		case EDAC_OPSTATE_NMI:
+			break;
+		default:
+			edac_op_state = EDAC_OPSTATE_POLL;
+			break;
+	}
+
 	/* check to see if device 0 function 1 is enabled; if it isn't, we
 	 * assume the BIOS has reserved it for a reason and is expecting
 	 * exclusive access, we take care not to violate that assumption and
@@ -1123,4 +1134,5 @@ MODULE_DESCRIPTION("MC support for Intel e752x memory controllers");
 module_param(force_function_unhide, int, 0444);
 MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:"
 " 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access");
-
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c
index 310d91b41c96..0827b9a7b386 100644
--- a/drivers/edac/e7xxx_edac.c
+++ b/drivers/edac/e7xxx_edac.c
@@ -27,6 +27,7 @@
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
 #include <linux/slab.h>
+#include <linux/edac.h>
 #include "edac_mc.h"
 
 #define	E7XXX_REVISION " Ver: 2.0.1 " __DATE__
@@ -419,6 +420,17 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx)
 	struct e7xxx_error_info discard;
 
 	debugf0("%s(): mci\n", __func__);
+
+	/* make sure error reporting method is sane */
+	switch(edac_op_state) {
+		case EDAC_OPSTATE_POLL:
+		case EDAC_OPSTATE_NMI:
+			break;
+		default:
+			edac_op_state = EDAC_OPSTATE_POLL;
+			break;
+	}
+
 	pci_read_config_dword(pdev, E7XXX_DRC, &drc);
 
 	drc_chan = dual_channel_active(drc, dev_idx);
@@ -565,3 +577,5 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Linux Networx (http://lnxi.com) Thayne Harbaugh et al\n"
 	"Based on.work by Dan Hollis et al");
 MODULE_DESCRIPTION("MC support for Intel e7xxx memory controllers");
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index eae1ca1caebd..81a28d6662e4 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/sysdev.h>
 #include <linux/ctype.h>
+#include <linux/edac.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
 #include <asm/edac.h>
@@ -241,6 +242,7 @@ static int add_mc_to_global_list (struct mem_ctl_info *mci)
 	}
 
 	list_add_tail_rcu(&mci->link, insert_before);
+	atomic_inc(&edac_handlers);
 	return 0;
 
 fail0:
@@ -267,6 +269,7 @@ static void complete_mc_list_del(struct rcu_head *head)
 
 static void del_mc_from_global_list(struct mem_ctl_info *mci)
 {
+	atomic_dec(&edac_handlers);
 	list_del_rcu(&mci->link);
 	init_completion(&mci->complete);
 	call_rcu(&mci->rcu, complete_mc_list_del);
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 3cd3a236821c..89c96ecbf04e 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -1,6 +1,7 @@
 
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/edac.h>
 
 #include "edac_mc.h"
 #include "edac_module.h"
@@ -101,6 +102,25 @@ static void do_edac_check(void)
 	edac_pci_do_parity_check();
 }
 
+/*
+ * handler for EDAC to check if NMI type handler has asserted interrupt
+ */
+static int edac_assert_error_check_and_clear(void)
+{
+	int vreg;
+
+	if(edac_op_state == EDAC_OPSTATE_POLL)
+		return 1;
+
+	vreg = atomic_read(&edac_err_assert);
+	if(vreg) {
+		atomic_set(&edac_err_assert, 0);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Action thread for EDAC to perform the POLL operations
  */
@@ -109,8 +129,8 @@ static int edac_kernel_thread(void *arg)
 	int msec;
 
 	while (!kthread_should_stop()) {
-
-		do_edac_check();
+		if(edac_assert_error_check_and_clear())
+			do_edac_check();
 
 		/* goto sleep for the interval */
 		msec = (HZ * edac_get_poll_msec()) / 1000;
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
new file mode 100644
index 000000000000..91a038d2f652
--- /dev/null
+++ b/drivers/edac/edac_stub.c
@@ -0,0 +1,42 @@
+/*
+ * common EDAC components that must be in kernel
+ *
+ * Author: Dave Jiang <djiang@mvista.com>
+ *
+ * 2007 (c) MontaVista Software, Inc. This file is licensed under
+ * the terms of the GNU General Public License version 2. This program
+ * is licensed "as is" without any warranty of any kind, whether express
+ * or implied.
+ *
+ */
+#include <linux/module.h>
+#include <linux/edac.h>
+#include <asm/atomic.h>
+#include <asm/edac.h>
+
+int edac_op_state = EDAC_OPSTATE_INVAL;
+EXPORT_SYMBOL(edac_op_state);
+
+atomic_t edac_handlers = ATOMIC_INIT(0);
+EXPORT_SYMBOL(edac_handlers);
+
+atomic_t edac_err_assert = ATOMIC_INIT(0);
+EXPORT_SYMBOL(edac_err_assert);
+
+inline int edac_handler_set(void)
+{
+	if (edac_op_state == EDAC_OPSTATE_POLL)
+		return 0;
+
+	return atomic_read(&edac_handlers);
+}
+EXPORT_SYMBOL(edac_handler_set);
+
+/*
+ * handler for NMI type of interrupts to assert error
+ */
+inline void edac_atomic_assert_error(void)
+{
+	atomic_set(&edac_err_assert, 1);
+}
+EXPORT_SYMBOL(edac_atomic_assert_error);
diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c
index 4d7e786065aa..8eb8b6e5b32c 100644
--- a/drivers/edac/i5000_edac.c
+++ b/drivers/edac/i5000_edac.c
@@ -19,6 +19,7 @@
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
 #include <linux/slab.h>
+#include <linux/edac.h>
 #include <asm/mmzone.h>
 
 #include "edac_mc.h"
@@ -1285,6 +1286,16 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
 	if (PCI_FUNC(pdev->devfn) != 0)
 		return -ENODEV;
 
+	/* make sure error reporting method is sane */
+	switch(edac_op_state) {
+		case EDAC_OPSTATE_POLL:
+		case EDAC_OPSTATE_NMI:
+			break;
+		default:
+			edac_op_state = EDAC_OPSTATE_POLL;
+			break;
+	}
+
 	/* Ask the devices for the number of CSROWS and CHANNELS so
 	 * that we can calculate the memory resources, etc
 	 *
@@ -1475,3 +1486,5 @@ MODULE_AUTHOR
     ("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>");
 MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - "
 		   I5000_REVISION);
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/include/linux/edac.h b/include/linux/edac.h
new file mode 100644
index 000000000000..c8b92d79f884
--- /dev/null
+++ b/include/linux/edac.h
@@ -0,0 +1,29 @@
+/*
+ * Generic EDAC defs
+ *
+ * Author: Dave Jiang <djiang@mvista.com>
+ *
+ * 2006-2007 (c) MontaVista Software, Inc. This file is licensed under
+ * the terms of the GNU General Public License version 2. This program
+ * is licensed "as is" without any warranty of any kind, whether express
+ * or implied.
+ *
+ */
+#ifndef _LINUX_EDAC_H_
+#define _LINUX_EDAC_H_
+
+#include <asm/atomic.h>
+
+#define EDAC_OPSTATE_INVAL	-1
+#define EDAC_OPSTATE_POLL	0
+#define EDAC_OPSTATE_NMI	1
+#define EDAC_OPSTATE_INT	2
+
+extern int edac_op_state;
+extern atomic_t edac_handlers;
+extern atomic_t edac_err_assert;
+
+extern int edac_handler_set(void);
+extern void edac_atomic_assert_error(void);
+
+#endif
-- 
cgit v1.3-8-gc7d7


From 535c6a53035d8911f6b90455550c5fde0da7b866 Mon Sep 17 00:00:00 2001
From: Jason Uhlenkott <juhlenko@akamai.com>
Date: Thu, 19 Jul 2007 01:49:48 -0700
Subject: drivers/edac: new inte 30x0 MC driver

Here's a driver for the Intel 3000 and 3010 memory controllers,
relative to today's Sourceforge code drop.  This has only had light
testing (I've yet to actually see it handle a memory error) but it
detects my hardware correctly.

Signed-off-by: Jason Uhlenkott <juhlenko@akamai.com>
Signed-off-by: Douglas Thompson <dougthompson@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/edac/Kconfig      |   7 +
 drivers/edac/Makefile     |   1 +
 drivers/edac/i3000_edac.c | 493 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h   |   1 +
 4 files changed, 502 insertions(+)
 create mode 100644 drivers/edac/i3000_edac.c

(limited to 'include/linux')

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 3cfd9065a9b4..e8de70cb22c4 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -88,6 +88,13 @@ config EDAC_I82875P
 	  Support for error detection and correction on the Intel
 	  DP82785P and E7210 server chipsets.
 
+config EDAC_I3000
+	tristate "Intel 3000/3010"
+	depends on EDAC_MM_EDAC && PCI && X86_32
+	help
+	  Support for error detection and correction on the Intel
+	  3000 and 3010 server chipsets.
+
 config EDAC_I82860
 	tristate "Intel 82860"
 	depends on EDAC_MM_EDAC && PCI && X86_32
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 19d5ac724098..547ea135b64e 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_EDAC_E7XXX)		+= e7xxx_edac.o
 obj-$(CONFIG_EDAC_E752X)		+= e752x_edac.o
 obj-$(CONFIG_EDAC_I82443BXGX)		+= i82443bxgx_edac.o
 obj-$(CONFIG_EDAC_I82875P)		+= i82875p_edac.o
+obj-$(CONFIG_EDAC_I3000)		+= i3000_edac.o
 obj-$(CONFIG_EDAC_I82860)		+= i82860_edac.o
 obj-$(CONFIG_EDAC_R82600)		+= r82600_edac.o
 
diff --git a/drivers/edac/i3000_edac.c b/drivers/edac/i3000_edac.c
new file mode 100644
index 000000000000..570925d410a7
--- /dev/null
+++ b/drivers/edac/i3000_edac.c
@@ -0,0 +1,493 @@
+/*
+ * Intel 3000/3010 Memory Controller kernel module
+ * Copyright (C) 2007 Akamai Technologies, Inc.
+ * Shamelessly copied from:
+ * 	Intel D82875P Memory Controller kernel module
+ * 	(C) 2003 Linux Networx (http://lnxi.com)
+ *
+ * This file may be distributed under the terms of the
+ * GNU General Public License.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/slab.h>
+#include "edac_core.h"
+
+#define I3000_REVISION		"1.1"
+
+#define EDAC_MOD_STR		"i3000_edac"
+
+#define I3000_RANKS		8
+#define I3000_RANKS_PER_CHANNEL	4
+#define I3000_CHANNELS		2
+
+/* Intel 3000 register addresses - device 0 function 0 - DRAM Controller */
+
+#define I3000_MCHBAR		0x44	/* MCH Memory Mapped Register BAR */
+#define I3000_MCHBAR_MASK	0xffffc000
+#define I3000_MMR_WINDOW_SIZE	16384
+
+#define I3000_EDEAP		0x70	/* Extended DRAM Error Address Pointer (8b)
+					 *
+					 * 7:1   reserved
+					 * 0     bit 32 of address
+					 */
+#define I3000_DEAP		0x58	/* DRAM Error Address Pointer (32b)
+					 *
+					 * 31:7  address
+					 * 6:1   reserved
+					 * 0     Error channel 0/1
+					 */
+#define I3000_DEAP_GRAIN	(1 << 7)
+#define I3000_DEAP_PFN(edeap, deap)	((((edeap) & 1) << (32 - PAGE_SHIFT)) | \
+					((deap) >> PAGE_SHIFT))
+#define I3000_DEAP_OFFSET(deap)		((deap) & ~(I3000_DEAP_GRAIN-1) & ~PAGE_MASK)
+#define I3000_DEAP_CHANNEL(deap)	((deap) & 1)
+
+#define I3000_DERRSYN		0x5c	/* DRAM Error Syndrome (8b)
+					 *
+					 *  7:0  DRAM ECC Syndrome
+					 */
+
+#define I3000_ERRSTS		0xc8	/* Error Status Register (16b)
+					 *
+					 * 15:12 reserved
+					 * 11    MCH Thermal Sensor Event for SMI/SCI/SERR
+					 * 10    reserved
+					 *  9    LOCK to non-DRAM Memory Flag (LCKF)
+					 *  8    Received Refresh Timeout Flag (RRTOF)
+					 *  7:2  reserved
+					 *  1    Multiple-bit DRAM ECC Error Flag (DMERR)
+					 *  0    Single-bit DRAM ECC Error Flag (DSERR)
+					 */
+#define I3000_ERRSTS_BITS	0x0b03	/* bits which indicate errors */
+#define I3000_ERRSTS_UE		0x0002
+#define I3000_ERRSTS_CE		0x0001
+
+#define I3000_ERRCMD		0xca	/* Error Command (16b)
+					 *
+					 * 15:12 reserved
+					 * 11    SERR on MCH Thermal Sensor Event (TSESERR)
+					 * 10    reserved
+					 *  9    SERR on LOCK to non-DRAM Memory (LCKERR)
+					 *  8    SERR on DRAM Refresh Timeout (DRTOERR)
+					 *  7:2  reserved
+					 *  1    SERR Multiple-Bit DRAM ECC Error (DMERR)
+					 *  0    SERR on Single-Bit ECC Error (DSERR)
+					 */
+
+/* Intel  MMIO register space - device 0 function 0 - MMR space */
+
+#define I3000_DRB_SHIFT 25	/* 32MiB grain */
+
+#define I3000_C0DRB		0x100	/* Channel 0 DRAM Rank Boundary (8b x 4)
+					 *
+					 * 7:0   Channel 0 DRAM Rank Boundary Address
+					 */
+#define I3000_C1DRB		0x180	/* Channel 1 DRAM Rank Boundary (8b x 4)
+					 *
+					 * 7:0   Channel 1 DRAM Rank Boundary Address
+					 */
+
+#define I3000_C0DRA		0x108	/* Channel 0 DRAM Rank Attribute (8b x 2)
+					 *
+					 * 7     reserved
+					 * 6:4   DRAM odd Rank Attribute
+					 * 3     reserved
+					 * 2:0   DRAM even Rank Attribute
+					 *
+					 * Each attribute defines the page
+					 * size of the corresponding rank:
+					 *     000: unpopulated
+					 *     001: reserved
+					 *     010: 4 KB
+					 *     011: 8 KB
+					 *     100: 16 KB
+					 *     Others: reserved
+					 */
+#define I3000_C1DRA		0x188	/* Channel 1 DRAM Rank Attribute (8b x 2) */
+#define ODD_RANK_ATTRIB(dra) (((dra) & 0x70) >> 4)
+#define EVEN_RANK_ATTRIB(dra) ((dra) & 0x07)
+
+#define I3000_C0DRC0		0x120	/* DRAM Controller Mode 0 (32b)
+					 *
+					 * 31:30 reserved
+					 * 29    Initialization Complete (IC)
+					 * 28:11 reserved
+					 * 10:8  Refresh Mode Select (RMS)
+					 * 7     reserved
+					 * 6:4   Mode Select (SMS)
+					 * 3:2   reserved
+					 * 1:0   DRAM Type (DT)
+					 */
+
+#define I3000_C0DRC1		0x124	/* DRAM Controller Mode 1 (32b)
+					 *
+					 * 31    Enhanced Addressing Enable (ENHADE)
+					 * 30:0  reserved
+					 */
+
+
+enum i3000p_chips {
+	I3000 = 0,
+};
+
+struct i3000_dev_info {
+	const char *ctl_name;
+};
+
+struct i3000_error_info {
+	u16 errsts;
+	u8 derrsyn;
+	u8 edeap;
+	u32 deap;
+	u16 errsts2;
+};
+
+static const struct i3000_dev_info i3000_devs[] = {
+	[I3000] = {
+		     .ctl_name = "i3000"
+	},
+};
+
+static struct pci_dev *mci_pdev = NULL;
+static int i3000_registered = 1;
+
+static void i3000_get_error_info(struct mem_ctl_info *mci,
+		struct i3000_error_info *info)
+{
+	struct pci_dev *pdev;
+
+	pdev = to_pci_dev(mci->dev);
+
+	/*
+	 * This is a mess because there is no atomic way to read all the
+	 * registers at once and the registers can transition from CE being
+	 * overwritten by UE.
+	 */
+	pci_read_config_word(pdev, I3000_ERRSTS, &info->errsts);
+	if (!(info->errsts & I3000_ERRSTS_BITS))
+		return;
+	pci_read_config_byte(pdev, I3000_EDEAP, &info->edeap);
+	pci_read_config_dword(pdev, I3000_DEAP, &info->deap);
+	pci_read_config_byte(pdev, I3000_DERRSYN, &info->derrsyn);
+	pci_read_config_word(pdev, I3000_ERRSTS, &info->errsts2);
+
+	/*
+	 * If the error is the same for both reads then the first set
+	 * of reads is valid.  If there is a change then there is a CE
+	 * with no info and the second set of reads is valid and
+	 * should be UE info.
+	 */
+	if ((info->errsts ^ info->errsts2) & I3000_ERRSTS_BITS) {
+			pci_read_config_byte(pdev, I3000_EDEAP,
+					&info->edeap);
+			pci_read_config_dword(pdev, I3000_DEAP,
+					&info->deap);
+			pci_read_config_byte(pdev, I3000_DERRSYN,
+					&info->derrsyn);
+	}
+
+	/* Clear any error bits.
+	 * (Yes, we really clear bits by writing 1 to them.)
+	 */
+	pci_write_bits16(pdev, I3000_ERRSTS, I3000_ERRSTS_BITS, I3000_ERRSTS_BITS);
+}
+
+static int i3000_process_error_info(struct mem_ctl_info *mci,
+		struct i3000_error_info *info, int handle_errors)
+{
+	int row, multi_chan;
+	int pfn, offset, channel;
+
+	multi_chan = mci->csrows[0].nr_channels - 1;
+
+	if (!(info->errsts & I3000_ERRSTS_BITS))
+		return 0;
+
+	if (!handle_errors)
+		return 1;
+
+	if ((info->errsts ^ info->errsts2) & I3000_ERRSTS_BITS) {
+		edac_mc_handle_ce_no_info(mci, "UE overwrote CE");
+		info->errsts = info->errsts2;
+	}
+
+	pfn = I3000_DEAP_PFN(info->edeap, info->deap);
+	offset = I3000_DEAP_OFFSET(info->deap);
+	channel = I3000_DEAP_CHANNEL(info->deap);
+
+	row = edac_mc_find_csrow_by_page(mci, pfn);
+
+	if (info->errsts & I3000_ERRSTS_UE)
+		edac_mc_handle_ue(mci, pfn, offset, row, "i3000 UE");
+	else
+		edac_mc_handle_ce(mci, pfn, offset, info->derrsyn, row,
+				       multi_chan ? channel : 0,
+				       "i3000 CE");
+
+	return 1;
+}
+
+static void i3000_check(struct mem_ctl_info *mci)
+{
+	struct i3000_error_info info;
+
+	debugf1("MC%d: %s()\n", mci->mc_idx, __func__);
+	i3000_get_error_info(mci, &info);
+	i3000_process_error_info(mci, &info, 1);
+}
+
+static int i3000_is_interleaved(const unsigned char *c0dra,
+				const unsigned char *c1dra,
+				const unsigned char *c0drb,
+				const unsigned char *c1drb)
+{
+	int i;
+
+	/* If the channels aren't populated identically then
+	 * we're not interleaved.
+	 */
+	for (i = 0; i < I3000_RANKS_PER_CHANNEL / 2; i++)
+		if (ODD_RANK_ATTRIB(c0dra[i]) != ODD_RANK_ATTRIB(c1dra[i]) ||
+		    EVEN_RANK_ATTRIB(c0dra[i]) != EVEN_RANK_ATTRIB(c1dra[i]))
+			return 0;
+
+	/* If the rank boundaries for the two channels are different
+	 * then we're not interleaved.
+	 */
+	for (i = 0; i < I3000_RANKS_PER_CHANNEL; i++)
+		if (c0drb[i] != c1drb[i])
+			return 0;
+
+	return 1;
+}
+
+static int i3000_probe1(struct pci_dev *pdev, int dev_idx)
+{
+	int rc;
+	int i;
+	struct mem_ctl_info *mci = NULL;
+	unsigned long last_cumul_size;
+	int interleaved, nr_channels;
+	unsigned char dra[I3000_RANKS / 2], drb[I3000_RANKS];
+	unsigned char *c0dra = dra, *c1dra = &dra[I3000_RANKS_PER_CHANNEL / 2];
+	unsigned char *c0drb = drb, *c1drb = &drb[I3000_RANKS_PER_CHANNEL];
+	unsigned long mchbar;
+	void *window;
+
+	debugf0("MC: %s()\n", __func__);
+
+	pci_read_config_dword(pdev, I3000_MCHBAR, (u32 *)&mchbar);
+	mchbar &= I3000_MCHBAR_MASK;
+	window = ioremap_nocache(mchbar, I3000_MMR_WINDOW_SIZE);
+	if (!window) {
+		printk(KERN_ERR "i3000: cannot map mmio space at 0x%lx\n", mchbar);
+		return -ENODEV;
+	}
+
+	c0dra[0] = readb(window + I3000_C0DRA + 0); /* ranks 0,1 */
+	c0dra[1] = readb(window + I3000_C0DRA + 1); /* ranks 2,3 */
+	c1dra[0] = readb(window + I3000_C1DRA + 0); /* ranks 0,1 */
+	c1dra[1] = readb(window + I3000_C1DRA + 1); /* ranks 2,3 */
+
+	for (i = 0; i < I3000_RANKS_PER_CHANNEL; i++) {
+		c0drb[i] = readb(window + I3000_C0DRB + i);
+		c1drb[i] = readb(window + I3000_C1DRB + i);
+	}
+
+	iounmap(window);
+
+	/* Figure out how many channels we have.
+	 *
+	 * If we have what the datasheet calls "asymmetric channels"
+	 * (essentially the same as what was called "virtual single
+	 * channel mode" in the i82875) then it's a single channel as
+	 * far as EDAC is concerned.
+	 */
+	interleaved = i3000_is_interleaved(c0dra, c1dra, c0drb, c1drb);
+	nr_channels = interleaved ? 2 : 1;
+	mci = edac_mc_alloc(0, I3000_RANKS / nr_channels, nr_channels);
+	if (!mci)
+		return -ENOMEM;
+
+	debugf3("MC: %s(): init mci\n", __func__);
+
+	mci->dev = &pdev->dev;
+	mci->mtype_cap = MEM_FLAG_DDR2;
+
+	mci->edac_ctl_cap = EDAC_FLAG_SECDED;
+	mci->edac_cap = EDAC_FLAG_SECDED;
+
+	mci->mod_name = EDAC_MOD_STR;
+	mci->mod_ver = I3000_REVISION;
+	mci->ctl_name = i3000_devs[dev_idx].ctl_name;
+	mci->dev_name = pci_name(pdev);
+	mci->edac_check = i3000_check;
+	mci->ctl_page_to_phys = NULL;
+
+	/*
+	 * The dram rank boundary (DRB) reg values are boundary addresses
+	 * for each DRAM rank with a granularity of 32MB.  DRB regs are
+	 * cumulative; the last one will contain the total memory
+	 * contained in all ranks.
+	 *
+	 * If we're in interleaved mode then we're only walking through
+	 * the ranks of controller 0, so we double all the values we see.
+	 */
+	for (last_cumul_size = i = 0; i < mci->nr_csrows; i++) {
+		u8 value;
+		u32 cumul_size;
+		struct csrow_info *csrow = &mci->csrows[i];
+
+		value = drb[i];
+		cumul_size = value << (I3000_DRB_SHIFT - PAGE_SHIFT);
+		if (interleaved)
+			cumul_size <<= 1;
+		debugf3("MC: %s(): (%d) cumul_size 0x%x\n",
+			__func__, i, cumul_size);
+		if (cumul_size == last_cumul_size) {
+			csrow->mtype = MEM_EMPTY;
+			continue;
+		}
+
+		csrow->first_page = last_cumul_size;
+		csrow->last_page = cumul_size - 1;
+		csrow->nr_pages = cumul_size - last_cumul_size;
+		last_cumul_size = cumul_size;
+		csrow->grain = I3000_DEAP_GRAIN;
+		csrow->mtype = MEM_DDR2;
+		csrow->dtype = DEV_UNKNOWN;
+		csrow->edac_mode = EDAC_UNKNOWN;
+	}
+
+	/* Clear any error bits.
+	 * (Yes, we really clear bits by writing 1 to them.)
+	 */
+	pci_write_bits16(pdev, I3000_ERRSTS, I3000_ERRSTS_BITS, I3000_ERRSTS_BITS);
+
+	rc = -ENODEV;
+	if (edac_mc_add_mc(mci, 0)) {
+		debugf3("MC: %s(): failed edac_mc_add_mc()\n", __func__);
+		goto fail;
+	}
+
+	/* get this far and it's successful */
+	debugf3("MC: %s(): success\n", __func__);
+	return 0;
+
+      fail:
+	if (mci)
+		edac_mc_free(mci);
+
+	return rc;
+}
+
+/* returns count (>= 0), or negative on error */
+static int __devinit i3000_init_one(struct pci_dev *pdev,
+				      const struct pci_device_id *ent)
+{
+	int rc;
+
+	debugf0("MC: %s()\n", __func__);
+
+	if (pci_enable_device(pdev) < 0)
+		return -EIO;
+
+	rc = i3000_probe1(pdev, ent->driver_data);
+	if (mci_pdev == NULL)
+		mci_pdev = pci_dev_get(pdev);
+
+	return rc;
+}
+
+static void __devexit i3000_remove_one(struct pci_dev *pdev)
+{
+	struct mem_ctl_info *mci;
+
+	debugf0("%s()\n", __func__);
+
+	if ((mci = edac_mc_del_mc(&pdev->dev)) == NULL)
+		return;
+
+	edac_mc_free(mci);
+}
+
+static const struct pci_device_id i3000_pci_tbl[] __devinitdata = {
+	{
+		PCI_VEND_DEV(INTEL, 3000_HB), PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+	 	I3000
+	},
+	{
+		0,
+	}	/* 0 terminated list. */
+};
+
+MODULE_DEVICE_TABLE(pci, i3000_pci_tbl);
+
+static struct pci_driver i3000_driver = {
+	.name = EDAC_MOD_STR,
+	.probe = i3000_init_one,
+	.remove = __devexit_p(i3000_remove_one),
+	.id_table = i3000_pci_tbl,
+};
+
+static int __init i3000_init(void)
+{
+	int pci_rc;
+
+	debugf3("MC: %s()\n", __func__);
+	pci_rc = pci_register_driver(&i3000_driver);
+	if (pci_rc < 0)
+		goto fail0;
+
+	if (mci_pdev == NULL) {
+		i3000_registered = 0;
+		mci_pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
+					  PCI_DEVICE_ID_INTEL_3000_HB, NULL);
+		if (!mci_pdev) {
+			debugf0("i3000 pci_get_device fail\n");
+			pci_rc = -ENODEV;
+			goto fail1;
+		}
+
+		pci_rc = i3000_init_one(mci_pdev, i3000_pci_tbl);
+		if (pci_rc < 0) {
+			debugf0("i3000 init fail\n");
+			pci_rc = -ENODEV;
+			goto fail1;
+		}
+	}
+
+	return 0;
+
+fail1:
+	pci_unregister_driver(&i3000_driver);
+
+fail0:
+	if (mci_pdev)
+		pci_dev_put(mci_pdev);
+
+	return pci_rc;
+}
+
+static void __exit i3000_exit(void)
+{
+	debugf3("MC: %s()\n", __func__);
+
+	pci_unregister_driver(&i3000_driver);
+	if (!i3000_registered) {
+		i3000_remove_one(mci_pdev);
+		pci_dev_put(mci_pdev);
+	}
+}
+
+module_init(i3000_init);
+module_exit(i3000_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Akamai Technologies Arthur Ulfeldt/Jason Uhlenkott");
+MODULE_DESCRIPTION("MC support for Intel 3000 memory hub controllers");
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2c7add169539..7ec01b7525b6 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2209,6 +2209,7 @@
 #define PCI_DEVICE_ID_INTEL_82915GM_IG	0x2592
 #define PCI_DEVICE_ID_INTEL_82945G_HB	0x2770
 #define PCI_DEVICE_ID_INTEL_82945G_IG	0x2772
+#define PCI_DEVICE_ID_INTEL_3000_HB	0x2778
 #define PCI_DEVICE_ID_INTEL_82945GM_HB	0x27A0
 #define PCI_DEVICE_ID_INTEL_82945GM_IG	0x27A2
 #define PCI_DEVICE_ID_INTEL_ICH6_0	0x2640
-- 
cgit v1.3-8-gc7d7


From 66ee2f940ac8ab25f0c43a1e717d25dc46bfe74d Mon Sep 17 00:00:00 2001
From: Dave Jiang <djiang@mvista.com>
Date: Thu, 19 Jul 2007 01:49:54 -0700
Subject: drivers/edac: mod assert_error check

Change error check and clear variable from an atomic to an int

Signed-off-by: Dave Jiang <djiang@mvista.com>
Signed-off-by: Douglas Thompson <dougthompson@xmission.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/edac/edac_mc.c   | 11 ++++-------
 drivers/edac/edac_stub.c |  4 ++--
 include/linux/edac.h     |  2 +-
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 3474ca9d90a4..7c952c68f0d6 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -222,18 +222,15 @@ static struct mem_ctl_info *find_mci_by_dev(struct device *dev)
  */
 static int edac_mc_assert_error_check_and_clear(void)
 {
-	int vreg;
+	int old_state;
 
 	if(edac_op_state == EDAC_OPSTATE_POLL)
 		return 1;
 
-	vreg = atomic_read(&edac_err_assert);
-	if(vreg) {
-		atomic_set(&edac_err_assert, 0);
-		return 1;
-	}
+	old_state = edac_err_assert;
+	edac_err_assert = 0;
 
-	return 0;
+	return old_state;
 }
 
 /*
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
index 91a038d2f652..3d259c231507 100644
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL(edac_op_state);
 atomic_t edac_handlers = ATOMIC_INIT(0);
 EXPORT_SYMBOL(edac_handlers);
 
-atomic_t edac_err_assert = ATOMIC_INIT(0);
+int edac_err_assert = 0;
 EXPORT_SYMBOL(edac_err_assert);
 
 inline int edac_handler_set(void)
@@ -37,6 +37,6 @@ EXPORT_SYMBOL(edac_handler_set);
  */
 inline void edac_atomic_assert_error(void)
 {
-	atomic_set(&edac_err_assert, 1);
+	edac_err_assert++;
 }
 EXPORT_SYMBOL(edac_atomic_assert_error);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index c8b92d79f884..eab451e69a91 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -20,8 +20,8 @@
 #define EDAC_OPSTATE_INT	2
 
 extern int edac_op_state;
+extern int edac_err_assert;
 extern atomic_t edac_handlers;
-extern atomic_t edac_err_assert;
 
 extern int edac_handler_set(void);
 extern void edac_atomic_assert_error(void);
-- 
cgit v1.3-8-gc7d7


From 53078ca84b1c01f36c306d1f52e2f88c7bb2f9e4 Mon Sep 17 00:00:00 2001
From: Douglas Thompson <dougthompson@xmission.com>
Date: Thu, 19 Jul 2007 01:50:17 -0700
Subject: include/linux/pci_id.h: add amd northbridge defines

pci_ids.h needs two of the AMD NB device-ids namely, Addressmap and the Memory
Controller devices

This patch adds those to the pci_id.h include file

Signed-off-by:	Douglas Thompson <dougthompson@xmission.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 7ec01b7525b6..b15c6498fe67 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -495,6 +495,8 @@
 
 #define PCI_VENDOR_ID_AMD		0x1022
 #define PCI_DEVICE_ID_AMD_K8_NB		0x1100
+#define PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP	0x1101
+#define PCI_DEVICE_ID_AMD_K8_NB_MEMCTL	0x1102
 #define PCI_DEVICE_ID_AMD_K8_NB_MISC	0x1103
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit v1.3-8-gc7d7


From e24b8cb4fa2bb779bdf48656152366b6f52f748f Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 8 Jul 2007 14:26:37 +0200
Subject: i2c: Delete the i2c-isa pseudo bus driver

There are no users of i2c-isa left, so we can finally get rid of it.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/feature-removal-schedule.txt |   9 --
 arch/arm/configs/badge4_defconfig          |   1 -
 arch/arm/configs/clps7500_defconfig        |   1 -
 arch/arm/configs/footbridge_defconfig      |   1 -
 arch/arm/configs/neponset_defconfig        |   1 -
 arch/arm/configs/picotux200_defconfig      |   1 -
 arch/arm/configs/rpc_defconfig             |   1 -
 arch/arm/configs/s3c2410_defconfig         |   1 -
 arch/m32r/m32104ut/defconfig.m32104ut      |   1 -
 arch/ppc/configs/ev64260_defconfig         |   1 -
 arch/ppc/configs/mpc8540_ads_defconfig     |   1 -
 arch/ppc/configs/mpc8548_cds_defconfig     |   1 -
 arch/ppc/configs/mpc8555_cds_defconfig     |   1 -
 arch/ppc/configs/mpc8560_ads_defconfig     |   1 -
 arch/ppc/configs/radstone_ppc7d_defconfig  |   1 -
 arch/ppc/configs/stx_gp3_defconfig         |   1 -
 arch/ppc/configs/sycamore_defconfig        |   1 -
 drivers/i2c/busses/Kconfig                 |   3 -
 drivers/i2c/busses/Makefile                |   1 -
 drivers/i2c/busses/i2c-isa.c               | 192 -----------------------------
 drivers/i2c/i2c-core.c                     |   2 -
 include/linux/i2c-isa.h                    |  36 ------
 include/linux/i2c.h                        |   1 -
 23 files changed, 260 deletions(-)
 delete mode 100644 drivers/i2c/busses/i2c-isa.c
 delete mode 100644 include/linux/i2c-isa.h

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 7d3f205b0ba5..df1d62001227 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -181,15 +181,6 @@ Who:	Kay Sievers <kay.sievers@suse.de>
 
 ---------------------------
 
-What:	i2c-isa
-When:	December 2006
-Why:	i2c-isa is a non-sense and doesn't fit in the device driver
-	model. Drivers relying on it are better implemented as platform
-	drivers.
-Who:	Jean Delvare <khali@linux-fr.org>
-
----------------------------
-
 What:	i2c_adapter.list
 When:	July 2007
 Why:	Superfluous, this list duplicates the one maintained by the driver
diff --git a/arch/arm/configs/badge4_defconfig b/arch/arm/configs/badge4_defconfig
index 821865f75605..b2bbf217c707 100644
--- a/arch/arm/configs/badge4_defconfig
+++ b/arch/arm/configs/badge4_defconfig
@@ -708,7 +708,6 @@ CONFIG_I2C_ALGOPCF=m
 # I2C Hardware Bus support
 #
 CONFIG_I2C_ELEKTOR=m
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_PARPORT is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_STUB is not set
diff --git a/arch/arm/configs/clps7500_defconfig b/arch/arm/configs/clps7500_defconfig
index af9ae5389131..49e9f9d8b3d1 100644
--- a/arch/arm/configs/clps7500_defconfig
+++ b/arch/arm/configs/clps7500_defconfig
@@ -536,7 +536,6 @@ CONFIG_I2C_ALGOBIT=y
 # I2C Hardware Bus support
 #
 # CONFIG_I2C_ELEKTOR is not set
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_PARPORT is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_PCA_ISA is not set
diff --git a/arch/arm/configs/footbridge_defconfig b/arch/arm/configs/footbridge_defconfig
index 2a612d23120b..299dc22294a0 100644
--- a/arch/arm/configs/footbridge_defconfig
+++ b/arch/arm/configs/footbridge_defconfig
@@ -748,7 +748,6 @@ CONFIG_I2C=m
 # CONFIG_I2C_ELEKTOR is not set
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
diff --git a/arch/arm/configs/neponset_defconfig b/arch/arm/configs/neponset_defconfig
index e86794a10fc0..92ccdc6492f7 100644
--- a/arch/arm/configs/neponset_defconfig
+++ b/arch/arm/configs/neponset_defconfig
@@ -698,7 +698,6 @@ CONFIG_I2C_ALGOBIT=y
 # I2C Hardware Bus support
 #
 # CONFIG_I2C_ELEKTOR is not set
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_STUB is not set
 # CONFIG_I2C_PCA_ISA is not set
diff --git a/arch/arm/configs/picotux200_defconfig b/arch/arm/configs/picotux200_defconfig
index 339c48953a62..3c0c4f192dc1 100644
--- a/arch/arm/configs/picotux200_defconfig
+++ b/arch/arm/configs/picotux200_defconfig
@@ -735,7 +735,6 @@ CONFIG_I2C_CHARDEV=m
 # I2C Hardware Bus support
 #
 CONFIG_I2C_AT91=m
-CONFIG_I2C_ISA=m
 # CONFIG_I2C_OCORES is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_STUB is not set
diff --git a/arch/arm/configs/rpc_defconfig b/arch/arm/configs/rpc_defconfig
index bc091264d354..8452dc8c7cc3 100644
--- a/arch/arm/configs/rpc_defconfig
+++ b/arch/arm/configs/rpc_defconfig
@@ -558,7 +558,6 @@ CONFIG_I2C_ALGOBIT=y
 #
 # I2C Hardware Bus support
 #
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_PARPORT is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_STUB is not set
diff --git a/arch/arm/configs/s3c2410_defconfig b/arch/arm/configs/s3c2410_defconfig
index a850da377a29..1d5150e4d6b3 100644
--- a/arch/arm/configs/s3c2410_defconfig
+++ b/arch/arm/configs/s3c2410_defconfig
@@ -826,7 +826,6 @@ CONFIG_I2C_ALGOBIT=m
 # I2C Hardware Bus support
 #
 # CONFIG_I2C_ELEKTOR is not set
-CONFIG_I2C_ISA=m
 # CONFIG_I2C_OCORES is not set
 # CONFIG_I2C_PARPORT is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
diff --git a/arch/m32r/m32104ut/defconfig.m32104ut b/arch/m32r/m32104ut/defconfig.m32104ut
index 7b68fe8d921e..1f88f493a9e2 100644
--- a/arch/m32r/m32104ut/defconfig.m32104ut
+++ b/arch/m32r/m32104ut/defconfig.m32104ut
@@ -699,7 +699,6 @@ CONFIG_I2C_ALGOPCF=m
 # I2C Hardware Bus support
 #
 CONFIG_I2C_ELEKTOR=m
-CONFIG_I2C_ISA=m
 # CONFIG_I2C_OCORES is not set
 # CONFIG_I2C_PARPORT is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
diff --git a/arch/ppc/configs/ev64260_defconfig b/arch/ppc/configs/ev64260_defconfig
index 84cc142a67bb..587e9a3b9491 100644
--- a/arch/ppc/configs/ev64260_defconfig
+++ b/arch/ppc/configs/ev64260_defconfig
@@ -531,7 +531,6 @@ CONFIG_I2C_CHARDEV=m
 # CONFIG_I2C_AMD8111 is not set
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_PIIX4 is not set
diff --git a/arch/ppc/configs/mpc8540_ads_defconfig b/arch/ppc/configs/mpc8540_ads_defconfig
index c5c86025e261..bf676ebd99ab 100644
--- a/arch/ppc/configs/mpc8540_ads_defconfig
+++ b/arch/ppc/configs/mpc8540_ads_defconfig
@@ -452,7 +452,6 @@ CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_AMD8111 is not set
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
-# CONFIG_I2C_ISA is not set
 CONFIG_I2C_MPC=y
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
diff --git a/arch/ppc/configs/mpc8548_cds_defconfig b/arch/ppc/configs/mpc8548_cds_defconfig
index abe034f24b83..f36fc5db540b 100644
--- a/arch/ppc/configs/mpc8548_cds_defconfig
+++ b/arch/ppc/configs/mpc8548_cds_defconfig
@@ -413,7 +413,6 @@ CONFIG_I2C_CHARDEV=y
 #
 # I2C Hardware Bus support
 #
-# CONFIG_I2C_ISA is not set
 CONFIG_I2C_MPC=y
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_PCA_ISA is not set
diff --git a/arch/ppc/configs/mpc8555_cds_defconfig b/arch/ppc/configs/mpc8555_cds_defconfig
index 15abebf46b96..4f1e320acfbe 100644
--- a/arch/ppc/configs/mpc8555_cds_defconfig
+++ b/arch/ppc/configs/mpc8555_cds_defconfig
@@ -518,7 +518,6 @@ CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
 # CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_ISA is not set
 CONFIG_I2C_MPC=y
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
diff --git a/arch/ppc/configs/mpc8560_ads_defconfig b/arch/ppc/configs/mpc8560_ads_defconfig
index f834fb541ad5..f12d48fcbba7 100644
--- a/arch/ppc/configs/mpc8560_ads_defconfig
+++ b/arch/ppc/configs/mpc8560_ads_defconfig
@@ -489,7 +489,6 @@ CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
 # CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_ISA is not set
 CONFIG_I2C_MPC=y
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
diff --git a/arch/ppc/configs/radstone_ppc7d_defconfig b/arch/ppc/configs/radstone_ppc7d_defconfig
index ca4d1fd0ca05..9f64532f2a81 100644
--- a/arch/ppc/configs/radstone_ppc7d_defconfig
+++ b/arch/ppc/configs/radstone_ppc7d_defconfig
@@ -710,7 +710,6 @@ CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
 # CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_MPC is not set
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
diff --git a/arch/ppc/configs/stx_gp3_defconfig b/arch/ppc/configs/stx_gp3_defconfig
index 3fedc43e44ad..70d6f842aa9b 100644
--- a/arch/ppc/configs/stx_gp3_defconfig
+++ b/arch/ppc/configs/stx_gp3_defconfig
@@ -661,7 +661,6 @@ CONFIG_I2C_ALGOBIT=m
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
 # CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_MPC is not set
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT is not set
diff --git a/arch/ppc/configs/sycamore_defconfig b/arch/ppc/configs/sycamore_defconfig
index 758114cfea5c..6996cca18f3e 100644
--- a/arch/ppc/configs/sycamore_defconfig
+++ b/arch/ppc/configs/sycamore_defconfig
@@ -461,7 +461,6 @@ CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_I801 is not set
 # CONFIG_I2C_I810 is not set
 # CONFIG_I2C_IBM_IIC is not set
-# CONFIG_I2C_ISA is not set
 # CONFIG_I2C_NFORCE2 is not set
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_PIIX4 is not set
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 838dc1c19d61..c63bfa68e144 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -236,9 +236,6 @@ config I2C_IOP3XX
 	  This driver can also be built as a module.  If so, the module
 	  will be called i2c-iop3xx.
 
-config I2C_ISA
-	tristate
-
 config I2C_IXP4XX
 	tristate "IXP4xx GPIO-Based I2C Interface (DEPRECATED)"
 	depends on ARCH_IXP4XX
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index 14d1432f698b..b6a8037f1feb 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_I2C_I801)		+= i2c-i801.o
 obj-$(CONFIG_I2C_I810)		+= i2c-i810.o
 obj-$(CONFIG_I2C_IBM_IIC)	+= i2c-ibm_iic.o
 obj-$(CONFIG_I2C_IOP3XX)	+= i2c-iop3xx.o
-obj-$(CONFIG_I2C_ISA)		+= i2c-isa.o
 obj-$(CONFIG_I2C_IXP2000)	+= i2c-ixp2000.o
 obj-$(CONFIG_I2C_IXP4XX)	+= i2c-ixp4xx.o
 obj-$(CONFIG_I2C_POWERMAC)	+= i2c-powermac.o
diff --git a/drivers/i2c/busses/i2c-isa.c b/drivers/i2c/busses/i2c-isa.c
deleted file mode 100644
index b0e1370075de..000000000000
--- a/drivers/i2c/busses/i2c-isa.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
-    i2c-isa.c - an i2c-core-like thing for ISA hardware monitoring chips
-    Copyright (C) 2005  Jean Delvare <khali@linux-fr.org>
-
-    Based on the i2c-isa pseudo-adapter from the lm_sensors project
-    Copyright (c) 1998, 1999  Frodo Looijaard <frodol@dds.nl> 
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-/* This implements an i2c-core-like thing for ISA hardware monitoring
-   chips. Such chips are linked to the i2c subsystem for historical
-   reasons (because the early ISA hardware monitoring chips such as the
-   LM78 had both an I2C and an ISA interface). They used to be
-   registered with the main i2c-core, but as a first step in the
-   direction of a clean separation between I2C and ISA chip drivers,
-   we now have this separate core for ISA ones. It is significantly
-   more simple than the real one, of course, because we don't have to
-   handle multiple busses: there is only one (fake) ISA adapter.
-   It is worth noting that we still rely on i2c-core for some things
-   at the moment - but hopefully this won't last. */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/i2c.h>
-#include <linux/i2c-isa.h>
-#include <linux/platform_device.h>
-#include <linux/completion.h>
-
-/* Exported by i2c-core for i2c-isa only */
-extern void i2c_adapter_dev_release(struct device *dev);
-extern struct class i2c_adapter_class;
-
-static u32 isa_func(struct i2c_adapter *adapter);
-
-/* This is the actual algorithm we define */
-static const struct i2c_algorithm isa_algorithm = {
-	.functionality	= isa_func,
-};
-
-/* There can only be one... */
-static struct i2c_adapter isa_adapter = {
-	.owner		= THIS_MODULE,
-	.id		= I2C_HW_ISA,
-	.class          = I2C_CLASS_HWMON,
-	.algo		= &isa_algorithm,
-	.name		= "ISA main adapter",
-};
-
-/* We can't do a thing... */
-static u32 isa_func(struct i2c_adapter *adapter)
-{
-	return 0;
-}
-
-
-/* We implement an interface which resembles i2c_{add,del}_driver,
-   but for i2c-isa drivers. We don't have to remember and handle lists
-   of drivers and adapters so this is much more simple, of course. */
-
-int i2c_isa_add_driver(struct i2c_driver *driver)
-{
-	int res;
-
-	/* Add the driver to the list of i2c drivers in the driver core */
-	driver->driver.bus = &i2c_bus_type;
-	res = driver_register(&driver->driver);
-	if (res)
-		return res;
-	dev_dbg(&isa_adapter.dev, "Driver %s registered\n", driver->driver.name);
-
-	/* Now look for clients */
-	res = driver->attach_adapter(&isa_adapter);
-	if (res) {
-		dev_dbg(&isa_adapter.dev,
-			"Driver %s failed to attach adapter, unregistering\n",
-			driver->driver.name);
-		driver_unregister(&driver->driver);
-	}
-	return res;
-}
-
-int i2c_isa_del_driver(struct i2c_driver *driver)
-{
-	struct list_head *item, *_n;
-	struct i2c_client *client;
-	int res;
-
-	/* Detach all clients belonging to this one driver */
-	list_for_each_safe(item, _n, &isa_adapter.clients) {
-		client = list_entry(item, struct i2c_client, list);
-		if (client->driver != driver)
-			continue;
-		dev_dbg(&isa_adapter.dev, "Detaching client %s at 0x%x\n",
-			client->name, client->addr);
-		if ((res = driver->detach_client(client))) {
-			dev_err(&isa_adapter.dev, "Failed, driver "
-				"%s not unregistered!\n",
-				driver->driver.name);
-			return res;
-		}
-	}
-
-	/* Get the driver off the core list */
-	driver_unregister(&driver->driver);
-	dev_dbg(&isa_adapter.dev, "Driver %s unregistered\n", driver->driver.name);
-
-	return 0;
-}
-
-
-static int __init i2c_isa_init(void)
-{
-	int err;
-
-	mutex_init(&isa_adapter.clist_lock);
-	INIT_LIST_HEAD(&isa_adapter.clients);
-
-	isa_adapter.nr = ANY_I2C_ISA_BUS;
-	isa_adapter.dev.parent = &platform_bus;
-	sprintf(isa_adapter.dev.bus_id, "i2c-%d", isa_adapter.nr);
-	isa_adapter.dev.release = &i2c_adapter_dev_release;
-	isa_adapter.dev.class = &i2c_adapter_class;
-	err = device_register(&isa_adapter.dev);
-	if (err) {
-		printk(KERN_ERR "i2c-isa: Failed to register device\n");
-		goto exit;
-	}
-
-	dev_dbg(&isa_adapter.dev, "%s registered\n", isa_adapter.name);
-
-	return 0;
-
-exit:
-	return err;
-}
-
-static void __exit i2c_isa_exit(void)
-{
-#ifdef DEBUG
-	struct list_head  *item, *_n;
-	struct i2c_client *client = NULL;
-#endif
-
-	/* There should be no more active client */
-#ifdef DEBUG
-	dev_dbg(&isa_adapter.dev, "Looking for clients\n");
-	list_for_each_safe(item, _n, &isa_adapter.clients) {
-		client = list_entry(item, struct i2c_client, list);
-		dev_err(&isa_adapter.dev, "Driver %s still has an active "
-			"ISA client at 0x%x\n", client->driver->driver.name,
-			client->addr);
-	}
-	if (client != NULL)
-		return;
-#endif
-
-	/* Clean up the sysfs representation */
-	dev_dbg(&isa_adapter.dev, "Unregistering from sysfs\n");
-	init_completion(&isa_adapter.dev_released);
-	device_unregister(&isa_adapter.dev);
-
-	/* Wait for sysfs to drop all references */
-	dev_dbg(&isa_adapter.dev, "Waiting for sysfs completion\n");
-	wait_for_completion(&isa_adapter.dev_released);
-
-	dev_dbg(&isa_adapter.dev, "%s unregistered\n", isa_adapter.name);
-}
-
-EXPORT_SYMBOL(i2c_isa_add_driver);
-EXPORT_SYMBOL(i2c_isa_del_driver);
-
-MODULE_AUTHOR("Jean Delvare <khali@linux-fr.org>");
-MODULE_DESCRIPTION("ISA bus access through i2c");
-MODULE_LICENSE("GPL");
-
-module_init(i2c_isa_init);
-module_exit(i2c_isa_exit);
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 435925eba437..931f34592be9 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -286,7 +286,6 @@ void i2c_adapter_dev_release(struct device *dev)
 	struct i2c_adapter *adap = to_i2c_adapter(dev);
 	complete(&adap->dev_released);
 }
-EXPORT_SYMBOL_GPL(i2c_adapter_dev_release);	/* exported to i2c-isa */
 
 static ssize_t
 show_adapter_name(struct device *dev, struct device_attribute *attr, char *buf)
@@ -305,7 +304,6 @@ struct class i2c_adapter_class = {
 	.name			= "i2c-adapter",
 	.dev_attrs		= i2c_adapter_attrs,
 };
-EXPORT_SYMBOL_GPL(i2c_adapter_class);		/* exported to i2c-isa */
 
 static void i2c_scan_static_board_info(struct i2c_adapter *adapter)
 {
diff --git a/include/linux/i2c-isa.h b/include/linux/i2c-isa.h
deleted file mode 100644
index 67e3598c4cec..000000000000
--- a/include/linux/i2c-isa.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * i2c-isa.h - definitions for the i2c-isa pseudo-i2c-adapter interface
- *
- * Copyright (C) 2005 Jean Delvare <khali@linux-fr.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef _LINUX_I2C_ISA_H
-#define _LINUX_I2C_ISA_H
-
-#include <linux/i2c.h>
-
-extern int i2c_isa_add_driver(struct i2c_driver *driver);
-extern int i2c_isa_del_driver(struct i2c_driver *driver);
-
-/* Detect whether we are on the isa bus. This is only useful to hybrid
-   (i2c+isa) drivers. */
-#define i2c_is_isa_adapter(adapptr) \
-        ((adapptr)->id == I2C_HW_ISA)
-#define i2c_is_isa_client(clientptr) \
-        i2c_is_isa_adapter((clientptr)->adapter)
-
-#endif /* _LINUX_I2C_ISA_H */
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index cae7d618030c..47f40376a3c7 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -362,7 +362,6 @@ struct i2c_client_address_data {
 
 /* The numbers to use to set I2C bus address */
 #define ANY_I2C_BUS		0xffff
-#define ANY_I2C_ISA_BUS		9191
 
 
 /* ----- functions exported by i2c.o */
-- 
cgit v1.3-8-gc7d7


From be879c4e249a8875d7129f3b0c1bb62584dafbd8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 11 Jul 2007 18:39:02 -0400
Subject: SUNRPC: move bkl locking and xdr proc invocation into a common helper

Since every invocation of xdr encode or decode functions takes the BKL now,
there's a lot of redundant lock_kernel/unlock_kernel pairs that we can pull
out into a common function.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h     | 16 ++++++++++++++++
 net/sunrpc/auth.c              | 13 ++-----------
 net/sunrpc/auth_gss/auth_gss.c | 21 +++++----------------
 3 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 9e340fa23c06..c6b53d181bfa 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -12,6 +12,7 @@
 #include <linux/uio.h>
 #include <asm/byteorder.h>
 #include <linux/scatterlist.h>
+#include <linux/smp_lock.h>
 
 /*
  * Buffer adjustment
@@ -35,6 +36,21 @@ struct xdr_netobj {
  */
 typedef int	(*kxdrproc_t)(void *rqstp, __be32 *data, void *obj);
 
+/*
+ * We're still requiring the BKL in the xdr code until it's been
+ * more carefully audited, at which point this wrapper will become
+ * unnecessary.
+ */
+static inline int rpc_call_xdrproc(kxdrproc_t xdrproc, void *rqstp, __be32 *data, void *obj)
+{
+	int ret;
+
+	lock_kernel();
+	ret = xdrproc(rqstp, data, obj);
+	unlock_kernel();
+	return ret;
+}
+
 /*
  * Basic structure for transmission/reception of a client XDR message.
  * Features a header (for a linear buffer containing RPC headers
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 29a8ecc60928..1ea27559b1de 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -13,7 +13,6 @@
 #include <linux/errno.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/spinlock.h>
-#include <linux/smp_lock.h>
 
 #ifdef RPC_DEBUG
 # define RPCDBG_FACILITY	RPCDBG_AUTH
@@ -476,17 +475,13 @@ rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp,
 		__be32 *data, void *obj)
 {
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
-	int ret;
 
 	dprintk("RPC: %5u using %s cred %p to wrap rpc data\n",
 			task->tk_pid, cred->cr_ops->cr_name, cred);
 	if (cred->cr_ops->crwrap_req)
 		return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj);
 	/* By default, we encode the arguments normally. */
-	lock_kernel();
-	ret = encode(rqstp, data, obj);
-	unlock_kernel();
-	return ret;
+	return rpc_call_xdrproc(encode, rqstp, data, obj);
 }
 
 int
@@ -494,7 +489,6 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 		__be32 *data, void *obj)
 {
 	struct rpc_cred *cred = task->tk_msg.rpc_cred;
-	int ret;
 
 	dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n",
 			task->tk_pid, cred->cr_ops->cr_name, cred);
@@ -502,10 +496,7 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 		return cred->cr_ops->crunwrap_resp(task, decode, rqstp,
 						   data, obj);
 	/* By default, we decode the arguments normally. */
-	lock_kernel();
-	ret = decode(rqstp, data, obj);
-	unlock_kernel();
-	return ret;
+	return rpc_call_xdrproc(decode, rqstp, data, obj);
 }
 
 int
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index abfda33bac64..4bbc59cc237c 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -43,7 +43,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/auth.h>
@@ -1000,9 +999,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
 	*p++ = htonl(rqstp->rq_seqno);
 
-	lock_kernel();
-	status = encode(rqstp, p, obj);
-	unlock_kernel();
+	status = rpc_call_xdrproc(encode, rqstp, p, obj);
 	if (status)
 		return status;
 
@@ -1096,9 +1093,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
 	*p++ = htonl(rqstp->rq_seqno);
 
-	lock_kernel();
-	status = encode(rqstp, p, obj);
-	unlock_kernel();
+	status = rpc_call_xdrproc(encode, rqstp, p, obj);
 	if (status)
 		return status;
 
@@ -1157,16 +1152,12 @@ gss_wrap_req(struct rpc_task *task,
 		/* The spec seems a little ambiguous here, but I think that not
 		 * wrapping context destruction requests makes the most sense.
 		 */
-		lock_kernel();
-		status = encode(rqstp, p, obj);
-		unlock_kernel();
+		status = rpc_call_xdrproc(encode, rqstp, p, obj);
 		goto out;
 	}
 	switch (gss_cred->gc_service) {
 		case RPC_GSS_SVC_NONE:
-			lock_kernel();
-			status = encode(rqstp, p, obj);
-			unlock_kernel();
+			status = rpc_call_xdrproc(encode, rqstp, p, obj);
 			break;
 		case RPC_GSS_SVC_INTEGRITY:
 			status = gss_wrap_req_integ(cred, ctx, encode,
@@ -1282,9 +1273,7 @@ gss_unwrap_resp(struct rpc_task *task,
 	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp)
 						+ (savedlen - head->iov_len);
 out_decode:
-	lock_kernel();
-	status = decode(rqstp, p, obj);
-	unlock_kernel();
+	status = rpc_call_xdrproc(decode, rqstp, p, obj);
 out:
 	gss_put_ctx(ctx);
 	dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid,
-- 
cgit v1.3-8-gc7d7


From 4fdc17b2a7f4d9db5b08e0f963d0027f714e4104 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 14 Jul 2007 15:39:57 -0400
Subject: NFS: Introduce struct nfs_removeargs+nfs_removeres

We need a common structure for setting up an unlink() rpc call in order to
fix the asynchronous unlink code.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c        | 19 ++++++++++++++++---
 fs/nfs/nfs3proc.c       | 34 +++++++++++++++++-----------------
 fs/nfs/nfs3xdr.c        | 24 ++++++++++++++++++++++--
 fs/nfs/nfs4proc.c       | 31 +++++++++++++++----------------
 fs/nfs/nfs4xdr.c        |  8 ++++----
 fs/nfs/proc.c           | 20 ++++++++++----------
 include/linux/nfs_xdr.h | 27 +++++++++++++++------------
 7 files changed, 99 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7fcc78f2aa71..c5fce7567200 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -43,6 +43,7 @@
 #define NFS_entry_sz		(NFS_filename_sz+3)
 
 #define NFS_diropargs_sz	(NFS_fhandle_sz+NFS_filename_sz)
+#define NFS_removeargs_sz	(NFS_fhandle_sz+NFS_filename_sz)
 #define NFS_sattrargs_sz	(NFS_fhandle_sz+NFS_sattr_sz)
 #define NFS_readlinkargs_sz	(NFS_fhandle_sz)
 #define NFS_readargs_sz		(NFS_fhandle_sz+3)
@@ -66,7 +67,7 @@
  * Common NFS XDR functions as inlines
  */
 static inline __be32 *
-xdr_encode_fhandle(__be32 *p, struct nfs_fh *fhandle)
+xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle)
 {
 	memcpy(p, fhandle->data, NFS2_FHSIZE);
 	return p + XDR_QUADLEN(NFS2_FHSIZE);
@@ -204,7 +205,7 @@ nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
 
 /*
  * Encode directory ops argument
- * LOOKUP, REMOVE, RMDIR
+ * LOOKUP, RMDIR
  */
 static int
 nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
@@ -215,6 +216,18 @@ nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
 	return 0;
 }
 
+/*
+ * Encode REMOVE argument
+ */
+static int
+nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+{
+	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_encode_array(p, args->name.name, args->name.len);
+	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	return 0;
+}
+
 /*
  * Arguments to a READ call. Since we read data directly into the page
  * cache, we also set up the reply iovec here so that iov[1] points
@@ -705,7 +718,7 @@ struct rpc_procinfo	nfs_procedures[] = {
     PROC(READ,		readargs,	readres, 3),
     PROC(WRITE,		writeargs,	writeres, 4),
     PROC(CREATE,	createargs,	diropres, 0),
-    PROC(REMOVE,	diropargs,	stat, 0),
+    PROC(REMOVE,	removeargs,	stat, 0),
     PROC(RENAME,	renameargs,	stat, 0),
     PROC(LINK,		linkargs,	stat, 0),
     PROC(SYMLINK,	symlinkargs,	stat, 0),
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 814d886b6aa4..eac07f2c99dd 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -349,23 +349,23 @@ out:
 static int
 nfs3_proc_remove(struct inode *dir, struct qstr *name)
 {
-	struct nfs_fattr	dir_attr;
-	struct nfs3_diropargs	arg = {
-		.fh		= NFS_FH(dir),
-		.name		= name->name,
-		.len		= name->len
+	struct nfs_removeargs arg = {
+		.fh = NFS_FH(dir),
+		.name.len = name->len,
+		.name.name = name->name,
 	};
-	struct rpc_message	msg = {
-		.rpc_proc	= &nfs3_procedures[NFS3PROC_REMOVE],
-		.rpc_argp	= &arg,
-		.rpc_resp	= &dir_attr,
+	struct nfs_removeres res;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE],
+		.rpc_argp = &arg,
+		.rpc_resp = &res,
 	};
 	int			status;
 
 	dprintk("NFS call  remove %s\n", name->name);
-	nfs_fattr_init(&dir_attr);
+	nfs_fattr_init(&res.dir_attr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-	nfs_post_op_update_inode(dir, &dir_attr);
+	nfs_post_op_update_inode(dir, &res.dir_attr);
 	dprintk("NFS reply remove: %d\n", status);
 	return status;
 }
@@ -374,17 +374,17 @@ static int
 nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *name)
 {
 	struct unlinkxdr {
-		struct nfs3_diropargs arg;
-		struct nfs_fattr res;
+		struct nfs_removeargs arg;
+		struct nfs_removeres res;
 	} *ptr;
 
 	ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
 	if (!ptr)
 		return -ENOMEM;
 	ptr->arg.fh = NFS_FH(dir->d_inode);
-	ptr->arg.name = name->name;
-	ptr->arg.len = name->len;
-	nfs_fattr_init(&ptr->res);
+	ptr->arg.name.name = name->name;
+	ptr->arg.name.len = name->len;
+	nfs_fattr_init(&ptr->res.dir_attr);
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
 	msg->rpc_argp = &ptr->arg;
 	msg->rpc_resp = &ptr->res;
@@ -400,7 +400,7 @@ nfs3_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
 	if (nfs3_async_handle_jukebox(task, dir->d_inode))
 		return 1;
 	if (msg->rpc_argp) {
-		dir_attr = (struct nfs_fattr*)msg->rpc_resp;
+		dir_attr = &((struct nfs_removeres*)msg->rpc_resp)->dir_attr;
 		nfs_post_op_update_inode(dir->d_inode, dir_attr);
 		kfree(msg->rpc_argp);
 	}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index b4647a22f349..d9e08f0cf2a0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -50,6 +50,7 @@
 
 #define NFS3_sattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3)
 #define NFS3_diropargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
+#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)
 #define NFS3_accessargs_sz	(NFS3_fh_sz+1)
 #define NFS3_readlinkargs_sz	(NFS3_fh_sz)
 #define NFS3_readargs_sz	(NFS3_fh_sz+3)
@@ -65,6 +66,7 @@
 
 #define NFS3_attrstat_sz	(1+NFS3_fattr_sz)
 #define NFS3_wccstat_sz		(1+NFS3_wcc_data_sz)
+#define NFS3_removeres_sz	(NFS3_wccstat_sz)
 #define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)
 #define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1)
@@ -106,7 +108,7 @@ static struct {
  * Common NFS XDR functions as inlines
  */
 static inline __be32 *
-xdr_encode_fhandle(__be32 *p, struct nfs_fh *fh)
+xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh)
 {
 	return xdr_encode_array(p, fh->data, fh->size);
 }
@@ -299,6 +301,18 @@ nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
 	return 0;
 }
 
+/*
+ * Encode REMOVE argument
+ */
+static int
+nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
+{
+	p = xdr_encode_fhandle(p, args->fh);
+	p = xdr_encode_array(p, args->name.name, args->name.len);
+	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	return 0;
+}
+
 /*
  * Encode access() argument
  */
@@ -736,6 +750,12 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	return status;
 }
 
+static int
+nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
+{
+	return nfs3_xdr_wccstat(req, p, &res->dir_attr);
+}
+
 /*
  * Decode LOOKUP reply
  */
@@ -1126,7 +1146,7 @@ struct rpc_procinfo	nfs3_procedures[] = {
   PROC(MKDIR,		mkdirargs,	createres, 0),
   PROC(SYMLINK,		symlinkargs,	createres, 0),
   PROC(MKNOD,		mknodargs,	createres, 0),
-  PROC(REMOVE,		diropargs,	wccstat, 0),
+  PROC(REMOVE,		removeargs,	removeres, 0),
   PROC(RMDIR,		diropargs,	wccstat, 0),
   PROC(RENAME,		renameargs,	renameres, 0),
   PROC(LINK,		linkargs,	linkres, 0),
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7ead63e065ab..23dc25dbc6fa 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1925,28 +1925,27 @@ out:
 static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
-	struct nfs4_remove_arg args = {
+	struct nfs_removeargs args = {
 		.fh = NFS_FH(dir),
-		.name = name,
+		.name.len = name->len,
+		.name.name = name->name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs_fattr dir_attr;
-	struct nfs4_remove_res	res = {
+	struct nfs_removeres res = {
 		.server = server,
-		.dir_attr = &dir_attr,
 	};
 	struct rpc_message msg = {
-		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_REMOVE],
-		.rpc_argp	= &args,
-		.rpc_resp	= &res,
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
 	};
 	int			status;
 
-	nfs_fattr_init(res.dir_attr);
+	nfs_fattr_init(&res.dir_attr);
 	status = rpc_call_sync(server->client, &msg, 0);
 	if (status == 0) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, res.dir_attr);
+		nfs_post_op_update_inode(dir, &res.dir_attr);
 	}
 	return status;
 }
@@ -1964,9 +1963,8 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
 }
 
 struct unlink_desc {
-	struct nfs4_remove_arg	args;
-	struct nfs4_remove_res	res;
-	struct nfs_fattr dir_attr;
+	struct nfs_removeargs args;
+	struct nfs_removeres res;
 };
 
 static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir,
@@ -1980,10 +1978,11 @@ static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir,
 		return -ENOMEM;
 	
 	up->args.fh = NFS_FH(dir->d_inode);
-	up->args.name = name;
+	up->args.name.len = name->len;
+	up->args.name.name = name->name;
 	up->args.bitmask = server->attr_bitmask;
 	up->res.server = server;
-	up->res.dir_attr = &up->dir_attr;
+	nfs_fattr_init(&up->res.dir_attr);
 	
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 	msg->rpc_argp = &up->args;
@@ -1999,7 +1998,7 @@ static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
 	if (msg->rpc_resp != NULL) {
 		up = container_of(msg->rpc_resp, struct unlink_desc, res);
 		update_changeattr(dir->d_inode, &up->res.cinfo);
-		nfs_post_op_update_inode(dir->d_inode, up->res.dir_attr);
+		nfs_post_op_update_inode(dir->d_inode, &up->res.dir_attr);
 		kfree(up);
 		msg->rpc_resp = NULL;
 		msg->rpc_argp = NULL;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7b73ca8be909..badd73b7ca12 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1435,7 +1435,7 @@ out:
 /*
  * Encode REMOVE request
  */
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs4_remove_arg *args)
+static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1447,7 +1447,7 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
 	encode_compound_hdr(&xdr, &hdr);
 	if ((status = encode_putfh(&xdr, args->fh)) != 0)
 		goto out;
-	if ((status = encode_remove(&xdr, args->name)) != 0)
+	if ((status = encode_remove(&xdr, &args->name)) != 0)
 		goto out;
 	status = encode_getfattr(&xdr, args->bitmask);
 out:
@@ -3835,7 +3835,7 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_remove_res *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3848,7 +3848,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
 		goto out;
 	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server);
+	decode_getfattr(&xdr, &res->dir_attr, res->server);
 out:
 	return status;
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be0ee2782cb..3b3eb692e0f4 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -272,14 +272,14 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 static int
 nfs_proc_remove(struct inode *dir, struct qstr *name)
 {
-	struct nfs_diropargs	arg = {
-		.fh		= NFS_FH(dir),
-		.name		= name->name,
-		.len		= name->len
+	struct nfs_removeargs arg = {
+		.fh = NFS_FH(dir),
+		.name.len = name->len,
+		.name.name = name->name,
 	};
-	struct rpc_message	msg = { 
-		.rpc_proc	= &nfs_procedures[NFSPROC_REMOVE],
-		.rpc_argp	= &arg,
+	struct rpc_message msg = { 
+		.rpc_proc = &nfs_procedures[NFSPROC_REMOVE],
+		.rpc_argp = &arg,
 	};
 	int			status;
 
@@ -294,14 +294,14 @@ nfs_proc_remove(struct inode *dir, struct qstr *name)
 static int
 nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *name)
 {
-	struct nfs_diropargs	*arg;
+	struct nfs_removeargs *arg;
 
 	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
 	if (!arg)
 		return -ENOMEM;
 	arg->fh = NFS_FH(dir->d_inode);
-	arg->name = name->name;
-	arg->len = name->len;
+	arg->name.name = name->name;
+	arg->name.len = name->len;
 	msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
 	msg->rpc_argp = arg;
 	return 0;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 38d77681cf27..7babcb16300b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -277,6 +277,21 @@ struct nfs_writeres {
 	const struct nfs_server *server;
 };
 
+/*
+ * Common arguments to the unlink call
+ */
+struct nfs_removeargs {
+	const struct nfs_fh	*fh;
+	struct qstr		name;
+	const u32 *		bitmask;
+};
+
+struct nfs_removeres {
+	const struct nfs_server *server;
+	struct nfs4_change_info	cinfo;
+	struct nfs_fattr	dir_attr;
+};
+
 /*
  * Argument struct for decode_entry function
  */
@@ -631,18 +646,6 @@ struct nfs4_readlink {
 	struct page **			pages;   /* zero-copy data */
 };
 
-struct nfs4_remove_arg {
-	const struct nfs_fh *		fh;
-	const struct qstr *		name;
-	const u32 *			bitmask;
-};
-
-struct nfs4_remove_res {
-	const struct nfs_server *	server;
-	struct nfs4_change_info		cinfo;
-	struct nfs_fattr *		dir_attr;
-};
-
 struct nfs4_rename_arg {
 	const struct nfs_fh *		old_dir;
 	const struct nfs_fh *		new_dir;
-- 
cgit v1.3-8-gc7d7


From e4eff1a622edd6ab7b73acd5d8763aa2fa3fee49 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 14 Jul 2007 15:39:58 -0400
Subject: SUNRPC: Clean up the sillyrename code

Fix a couple of bugs:
 - Don't rely on the parent dentry still being valid when the call completes.
   Fixes a race with shrink_dcache_for_umount_subtree()

 - Don't remove the file if the filehandle has been labelled as stale.

Fix a couple of inefficiencies
 - Remove the global list of sillyrenamed files. Instead we can cache the
   sillyrename information in the dentry->d_fsdata
 - Move common code from unlink_setup/unlink_done into fs/nfs/unlink.c

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            |   4 +-
 fs/nfs/nfs3proc.c       |  38 +++-------
 fs/nfs/nfs4proc.c       |  50 ++++---------
 fs/nfs/proc.c           |  26 ++-----
 fs/nfs/unlink.c         | 195 +++++++++++++++++++++---------------------------
 include/linux/nfs_fs.h  |   4 +-
 include/linux/nfs_xdr.h |   5 +-
 7 files changed, 120 insertions(+), 202 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0fa1dbcdadb9..ea97408e423e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -869,7 +869,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
 		lock_kernel();
 		drop_nlink(inode);
-		nfs_complete_unlink(dentry);
+		nfs_complete_unlink(dentry, inode);
 		unlock_kernel();
 	}
 	/* When creating a negative dentry, we want to renew d_time */
@@ -1411,7 +1411,7 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 		nfs_renew_times(dentry);
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		d_move(dentry, sdentry);
-		error = nfs_async_unlink(dentry);
+		error = nfs_async_unlink(dir, dentry);
  		/* If we return 0 we don't unlink */
 	}
 	dput(sdentry);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index eac07f2c99dd..c7ca5d70870b 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -370,41 +370,21 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
 	return status;
 }
 
-static int
-nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *name)
+static void
+nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 {
-	struct unlinkxdr {
-		struct nfs_removeargs arg;
-		struct nfs_removeres res;
-	} *ptr;
-
-	ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
-	if (!ptr)
-		return -ENOMEM;
-	ptr->arg.fh = NFS_FH(dir->d_inode);
-	ptr->arg.name.name = name->name;
-	ptr->arg.name.len = name->len;
-	nfs_fattr_init(&ptr->res.dir_attr);
 	msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
-	msg->rpc_argp = &ptr->arg;
-	msg->rpc_resp = &ptr->res;
-	return 0;
 }
 
 static int
-nfs3_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
+nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
-	struct rpc_message *msg = &task->tk_msg;
-	struct nfs_fattr	*dir_attr;
-
-	if (nfs3_async_handle_jukebox(task, dir->d_inode))
-		return 1;
-	if (msg->rpc_argp) {
-		dir_attr = &((struct nfs_removeres*)msg->rpc_resp)->dir_attr;
-		nfs_post_op_update_inode(dir->d_inode, dir_attr);
-		kfree(msg->rpc_argp);
-	}
-	return 0;
+	struct nfs_removeres *res;
+	if (nfs3_async_handle_jukebox(task, dir))
+		return 0;
+	res = task->tk_msg.rpc_resp;
+	nfs_post_op_update_inode(dir, &res->dir_attr);
+	return 1;
 }
 
 static int
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 23dc25dbc6fa..6ca2795ccd9c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1962,48 +1962,26 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
 	return err;
 }
 
-struct unlink_desc {
-	struct nfs_removeargs args;
-	struct nfs_removeres res;
-};
-
-static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir,
-		struct qstr *name)
+static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 {
-	struct nfs_server *server = NFS_SERVER(dir->d_inode);
-	struct unlink_desc *up;
+	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_removeargs *args = msg->rpc_argp;
+	struct nfs_removeres *res = msg->rpc_resp;
 
-	up = kmalloc(sizeof(*up), GFP_KERNEL);
-	if (!up)
-		return -ENOMEM;
-	
-	up->args.fh = NFS_FH(dir->d_inode);
-	up->args.name.len = name->len;
-	up->args.name.name = name->name;
-	up->args.bitmask = server->attr_bitmask;
-	up->res.server = server;
-	nfs_fattr_init(&up->res.dir_attr);
-	
+	args->bitmask = server->attr_bitmask;
+	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
-	msg->rpc_argp = &up->args;
-	msg->rpc_resp = &up->res;
-	return 0;
 }
 
-static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
+static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
-	struct rpc_message *msg = &task->tk_msg;
-	struct unlink_desc *up;
-	
-	if (msg->rpc_resp != NULL) {
-		up = container_of(msg->rpc_resp, struct unlink_desc, res);
-		update_changeattr(dir->d_inode, &up->res.cinfo);
-		nfs_post_op_update_inode(dir->d_inode, &up->res.dir_attr);
-		kfree(up);
-		msg->rpc_resp = NULL;
-		msg->rpc_argp = NULL;
-	}
-	return 0;
+	struct nfs_removeres *res = task->tk_msg.rpc_resp;
+
+	if (nfs4_async_handle_error(task, res->server) == -EAGAIN)
+		return 0;
+	update_changeattr(dir, &res->cinfo);
+	nfs_post_op_update_inode(dir, &res->dir_attr);
+	return 1;
 }
 
 static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 3b3eb692e0f4..845cdde1d8b7 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -291,32 +291,16 @@ nfs_proc_remove(struct inode *dir, struct qstr *name)
 	return status;
 }
 
-static int
-nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *name)
+static void
+nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 {
-	struct nfs_removeargs *arg;
-
-	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
-	if (!arg)
-		return -ENOMEM;
-	arg->fh = NFS_FH(dir->d_inode);
-	arg->name.name = name->name;
-	arg->name.len = name->len;
 	msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
-	msg->rpc_argp = arg;
-	return 0;
 }
 
-static int
-nfs_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
+static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
-	struct rpc_message *msg = &task->tk_msg;
-	
-	if (msg->rpc_argp) {
-		nfs_mark_for_revalidate(dir->d_inode);
-		kfree(msg->rpc_argp);
-	}
-	return 0;
+	nfs_mark_for_revalidate(dir);
+	return 1;
 }
 
 static int
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 0e28189c2151..045ab805c17f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -3,7 +3,6 @@
  *
  * nfs sillydelete handling
  *
- * NOTE: we rely on holding the BKL for list manipulation protection.
  */
 
 #include <linux/slab.h>
@@ -15,46 +14,23 @@
 
 
 struct nfs_unlinkdata {
-	struct nfs_unlinkdata	*next;
-	struct dentry	*dir, *dentry;
-	struct qstr	name;
-	struct rpc_task	task;
+	struct nfs_removeargs args;
+	struct nfs_removeres res;
+	struct inode *dir;
 	struct rpc_cred	*cred;
-	unsigned int	count;
 };
 
-static struct nfs_unlinkdata	*nfs_deletes;
-static RPC_WAITQ(nfs_delete_queue, "nfs_delete_queue");
-
-/**
- * nfs_detach_unlinkdata - Remove asynchronous unlink from global list
- * @data: pointer to descriptor
- */
-static inline void
-nfs_detach_unlinkdata(struct nfs_unlinkdata *data)
-{
-	struct nfs_unlinkdata	**q;
-
-	for (q = &nfs_deletes; *q != NULL; q = &((*q)->next)) {
-		if (*q == data) {
-			*q = data->next;
-			break;
-		}
-	}
-}
-
 /**
- * nfs_put_unlinkdata - release data from a sillydelete operation.
+ * nfs_free_unlinkdata - release data from a sillydelete operation.
  * @data: pointer to unlink structure.
  */
 static void
-nfs_put_unlinkdata(struct nfs_unlinkdata *data)
+nfs_free_unlinkdata(struct nfs_unlinkdata *data)
 {
-	if (--data->count == 0) {
-		nfs_detach_unlinkdata(data);
-		kfree(data->name.name);
-		kfree(data);
-	}
+	iput(data->dir);
+	put_rpccred(data->cred);
+	kfree(data->args.name.name);
+	kfree(data);
 }
 
 #define NAME_ALLOC_LEN(len)	((len+16) & ~15)
@@ -63,50 +39,36 @@ nfs_put_unlinkdata(struct nfs_unlinkdata *data)
  * @dentry: pointer to dentry
  * @data: nfs_unlinkdata
  */
-static inline void
-nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data)
+static int nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data)
 {
 	char		*str;
 	int		len = dentry->d_name.len;
 
-	str = kmalloc(NAME_ALLOC_LEN(len), GFP_KERNEL);
+	str = kmemdup(dentry->d_name.name, NAME_ALLOC_LEN(len), GFP_KERNEL);
 	if (!str)
-		return;
-	memcpy(str, dentry->d_name.name, len);
-	if (!data->name.len) {
-		data->name.len = len;
-		data->name.name = str;
-	} else
-		kfree(str);
+		return -ENOMEM;
+	data->args.name.len = len;
+	data->args.name.name = str;
+	return 0;
 }
 
 /**
  * nfs_async_unlink_init - Initialize the RPC info
- * @task: rpc_task of the sillydelete
- *
- * We delay initializing RPC info until after the call to dentry_iput()
- * in order to minimize races against rename().
+ * task: rpc_task of the sillydelete
  */
 static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
 {
-	struct nfs_unlinkdata	*data = calldata;
-	struct dentry		*dir = data->dir;
-	struct rpc_message	msg = {
-		.rpc_cred	= data->cred,
+	struct nfs_unlinkdata *data = calldata;
+	struct inode *dir = data->dir;
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
 	};
-	int			status = -ENOENT;
-
-	if (!data->name.len)
-		goto out_err;
 
-	status = NFS_PROTO(dir->d_inode)->unlink_setup(&msg, dir, &data->name);
-	if (status < 0)
-		goto out_err;
-	nfs_begin_data_update(dir->d_inode);
+	nfs_begin_data_update(dir);
+	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 	rpc_call_setup(task, &msg, 0);
-	return;
- out_err:
-	rpc_exit(task, status);
 }
 
 /**
@@ -117,19 +79,13 @@ static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
  */
 static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_unlinkdata	*data = calldata;
-	struct dentry		*dir = data->dir;
-	struct inode		*dir_i;
-
-	if (!dir)
-		return;
-	dir_i = dir->d_inode;
-	nfs_end_data_update(dir_i);
-	if (NFS_PROTO(dir_i)->unlink_done(dir, task))
-		return;
-	put_rpccred(data->cred);
-	data->cred = NULL;
-	dput(dir);
+	struct nfs_unlinkdata *data = calldata;
+	struct inode *dir = data->dir;
+
+	if (!NFS_PROTO(dir)->unlink_done(task, dir))
+		rpc_restart_call(task);
+	else
+		nfs_end_data_update(dir);
 }
 
 /**
@@ -142,7 +98,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 static void nfs_async_unlink_release(void *calldata)
 {
 	struct nfs_unlinkdata	*data = calldata;
-	nfs_put_unlinkdata(data);
+	nfs_free_unlinkdata(data);
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
@@ -151,73 +107,94 @@ static const struct rpc_call_ops nfs_unlink_ops = {
 	.rpc_release = nfs_async_unlink_release,
 };
 
+static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
+{
+	struct rpc_task *task;
+	struct dentry *parent;
+	struct inode *dir;
+
+	if (nfs_copy_dname(dentry, data) < 0)
+		goto out_free;
+
+	parent = dget_parent(dentry);
+	if (parent == NULL)
+		goto out_free;
+	dir = igrab(parent->d_inode);
+	dput(parent);
+	if (dir == NULL)
+		goto out_free;
+
+	data->dir = dir;
+	data->args.fh = NFS_FH(dir);
+	nfs_fattr_init(&data->res.dir_attr);
+
+	task = rpc_run_task(NFS_CLIENT(dir), RPC_TASK_ASYNC, &nfs_unlink_ops, data);
+	if (!IS_ERR(task))
+		rpc_put_task(task);
+	return 1;
+out_free:
+	return 0;
+}
+
 /**
  * nfs_async_unlink - asynchronous unlinking of a file
+ * @dir: parent directory of dentry
  * @dentry: dentry to unlink
  */
 int
-nfs_async_unlink(struct dentry *dentry)
+nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct dentry	*dir = dentry->d_parent;
-	struct nfs_unlinkdata	*data;
-	struct rpc_clnt	*clnt = NFS_CLIENT(dir->d_inode);
-	int		status = -ENOMEM;
+	struct nfs_unlinkdata *data;
+	int status = -ENOMEM;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
+	if (data == NULL)
 		goto out;
 
-	data->cred = rpcauth_lookupcred(clnt->cl_auth, 0);
+	data->cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
 	if (IS_ERR(data->cred)) {
 		status = PTR_ERR(data->cred);
 		goto out_free;
 	}
-	data->dir = dget(dir);
-	data->dentry = dentry;
-
-	data->next = nfs_deletes;
-	nfs_deletes = data;
-	data->count = 1;
-
-	rpc_init_task(&data->task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data);
 
+	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		goto out_unlock;
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
+	dentry->d_fsdata = data;
 	spin_unlock(&dentry->d_lock);
-
-	rpc_sleep_on(&nfs_delete_queue, &data->task, NULL, NULL);
-	status = 0;
- out:
-	return status;
+	return 0;
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	put_rpccred(data->cred);
 out_free:
 	kfree(data);
+out:
 	return status;
 }
 
 /**
  * nfs_complete_unlink - Initialize completion of the sillydelete
  * @dentry: dentry to delete
+ * @inode: inode
  *
  * Since we're most likely to be called by dentry_iput(), we
  * only use the dentry to find the sillydelete. We then copy the name
  * into the qstr.
  */
 void
-nfs_complete_unlink(struct dentry *dentry)
+nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 {
-	struct nfs_unlinkdata	*data;
+	struct nfs_unlinkdata	*data = NULL;
 
-	for(data = nfs_deletes; data != NULL; data = data->next) {
-		if (dentry == data->dentry)
-			break;
-	}
-	if (!data)
-		return;
-	data->count++;
-	nfs_copy_dname(dentry, data);
 	spin_lock(&dentry->d_lock);
-	dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+		data = dentry->d_fsdata;
+	}
 	spin_unlock(&dentry->d_lock);
-	rpc_wake_up_task(&data->task);
-	nfs_put_unlinkdata(data);
+
+	if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)))
+		nfs_free_unlinkdata(data);
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c098ae194f79..9ba4aec37c50 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -407,8 +407,8 @@ extern void nfs_release_automount_timer(void);
 /*
  * linux/fs/nfs/unlink.c
  */
-extern int  nfs_async_unlink(struct dentry *);
-extern void nfs_complete_unlink(struct dentry *);
+extern int  nfs_async_unlink(struct inode *dir, struct dentry *dentry);
+extern void nfs_complete_unlink(struct dentry *dentry, struct inode *);
 
 /*
  * linux/fs/nfs/write.c
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7babcb16300b..cf74a4db84a5 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -791,9 +791,8 @@ struct nfs_rpc_ops {
 	int	(*create)  (struct inode *, struct dentry *,
 			    struct iattr *, int, struct nameidata *);
 	int	(*remove)  (struct inode *, struct qstr *);
-	int	(*unlink_setup)  (struct rpc_message *,
-			    struct dentry *, struct qstr *);
-	int	(*unlink_done) (struct dentry *, struct rpc_task *);
+	void	(*unlink_setup)  (struct rpc_message *, struct inode *dir);
+	int	(*unlink_done) (struct rpc_task *, struct inode *);
 	int	(*rename)  (struct inode *, struct qstr *,
 			    struct inode *, struct qstr *);
 	int	(*link)    (struct inode *, struct inode *, struct qstr *);
-- 
cgit v1.3-8-gc7d7


From e436d80085133858bf2613a630365e8a0459fd58 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 19 Jul 2007 21:28:35 +0200
Subject: [PATCH] sched: implement cpu_clock(cpu) high-speed time source

Implement the cpu_clock(cpu) interface for kernel-internal use:
high-speed (but slightly incorrect) per-cpu clock constructed from
sched_clock().

This API, unused at the moment, will be used in the future by blktrace,
by the softlockup-watchdog, by printk and by lockstat.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  7 +++++++
 kernel/sched.c        | 17 +++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 94f624aef017..33b9b4841ee7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1348,6 +1348,13 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 #endif
 
 extern unsigned long long sched_clock(void);
+
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+extern unsigned long long cpu_clock(int cpu);
+
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index a35a92ff38fd..93cf241cfbe9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -379,6 +379,23 @@ static inline unsigned long long rq_clock(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+unsigned long long cpu_clock(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long long now;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	now = rq_clock(rq);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	return now;
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /* Change a task's ->cfs_rq if it moves across CPUs */
 static inline void set_task_cfs_rq(struct task_struct *p)
-- 
cgit v1.3-8-gc7d7


From 626ac545c12e5f9bffe93086d1d03d26c99987ea Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Tue, 17 Jul 2007 15:28:17 -0400
Subject: user namespace: fix copy_user_ns return value

When a CONFIG_USER_NS=n and a user tries to unshare some namespace other
than the user namespace, the dummy copy_user_ns returns NULL rather than
the old_ns.

This value then gets assigned to task->nsproxy->user_ns, so that a
subsequent setuid, which uses task->nsproxy->user_ns, causes a NULL
pointer deref.

Fix this by returning old_ns.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/user_namespace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index bb320573bb9e..1101b0ce878f 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -49,7 +49,7 @@ static inline struct user_namespace *copy_user_ns(int flags,
 	if (flags & CLONE_NEWUSER)
 		return ERR_PTR(-EINVAL);
 
-	return NULL;
+	return old_ns;
 }
 
 static inline void put_user_ns(struct user_namespace *ns)
-- 
cgit v1.3-8-gc7d7


From 342cdb6d4739cee430efc3eafcacd1605db66036 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Jul 2007 01:11:55 +0200
Subject: ide: make ide_get_best_pio_mode() print info if overriding PIO mode

* Print info about overriding PIO mode in ide_get_best_pio_mode().

* Remove info about overriding PIO mode from cmd64{0,x} host drivers.

* Remove no longer needed ide_pio_data_t.overridden field.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-lib.c    | 10 +++++++---
 drivers/ide/pci/cmd640.c |  7 +++----
 drivers/ide/pci/cmd64x.c |  5 ++---
 include/linux/ide.h      |  1 -
 4 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 074bb32a4a40..d45bbad9ffe7 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -277,7 +277,7 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 	} else if (!drive->id) {
 		pio_mode = 0;
 	} else if ((pio_mode = ide_scan_pio_blacklist(id->model)) != -1) {
-		overridden = 1;
+		printk(KERN_INFO "%s: is on PIO blacklist\n", drive->name);
 		use_iordy = (pio_mode > 2);
 	} else {
 		pio_mode = id->tPIO;
@@ -303,12 +303,17 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 			}
 		}
 
+		if (overridden)
+			printk(KERN_INFO "%s: tPIO > 2, assuming tPIO = 2\n",
+					 drive->name);
+
 		/*
 		 * Conservative "downgrade" for all pre-ATA2 drives
 		 */
 		if (pio_mode && pio_mode < 4) {
 			pio_mode--;
-			overridden = 1;
+			printk(KERN_INFO "%s: applying conservative "
+					 "PIO \"downgrade\"\n", drive->name);
 			if (cycle_time && cycle_time < ide_pio_timings[pio_mode].cycle_time)
 				cycle_time = 0; /* use standard timing */
 		}
@@ -321,7 +326,6 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 		d->pio_mode = pio_mode;
 		d->cycle_time = cycle_time ? cycle_time : ide_pio_timings[pio_mode].cycle_time;
 		d->use_iordy = use_iordy;
-		d->overridden = overridden;
 	}
 	return pio_mode;
 }
diff --git a/drivers/ide/pci/cmd640.c b/drivers/ide/pci/cmd640.c
index dc43f009acab..dff21597e643 100644
--- a/drivers/ide/pci/cmd640.c
+++ b/drivers/ide/pci/cmd640.c
@@ -665,13 +665,12 @@ static void cmd640_tune_drive (ide_drive_t *drive, u8 mode_wanted)
 	(void) ide_get_best_pio_mode (drive, mode_wanted, 5, &d);
 	cmd640_set_mode (index, d.pio_mode, d.cycle_time);
 
-	printk ("%s: selected cmd640 PIO mode%d (%dns)%s",
+	printk("%s: selected cmd640 PIO mode%d (%dns)",
 		drive->name,
 		d.pio_mode,
-		d.cycle_time,
-		d.overridden ? " (overriding vendor mode)" : "");
+		d.cycle_time);
+
 	display_clocks(index);
-	return;
 }
 
 #endif /* CONFIG_BLK_DEV_CMD640_ENHANCED */
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 1e89dd6e5bbf..5171e94facdc 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -227,9 +227,8 @@ static u8 cmd64x_tune_pio (ide_drive_t *drive, u8 mode_wanted)
 	static const u8 arttim_regs[4] = {ARTTIM0, ARTTIM1, ARTTIM23, ARTTIM23};
 	pio_mode = ide_get_best_pio_mode(drive, mode_wanted, 5, &pio);
 
-	cmdprintk("%s: PIO mode wanted %d, selected %d (%d ns)%s\n",
-		  drive->name, mode_wanted, pio_mode, pio.cycle_time,
-		  pio.overridden ? " (overriding vendor mode)" : "");
+	cmdprintk("%s: PIO mode wanted %d, selected %d (%d ns)\n",
+		  drive->name, mode_wanted, pio_mode, pio.cycle_time);
 
 	program_cycle_times(drive, pio.cycle_time,
 			    ide_pio_timings[pio_mode].active_time);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 19ab25804056..83a117d673c7 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1375,7 +1375,6 @@ typedef struct ide_pio_timings_s {
 typedef struct ide_pio_data_s {
 	u8 pio_mode;
 	u8 use_iordy;
-	u8 overridden;
 	unsigned int cycle_time;
 } ide_pio_data_t;
 
-- 
cgit v1.3-8-gc7d7


From 2229833c1365346b64357a9263fa724f74f5e376 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Jul 2007 01:11:55 +0200
Subject: ide: add ide_dev_has_iordy() helper (take 4)

* Add ide_dev_has_iordy() helper and use it sl82c105 host driver.

* Remove no longer needed ide_pio_data_t.use_iordy field.

v2/v3:
* Fix issues noticed by Sergei:
  - correct patch description
  - fix comment in ide_get_best_pio_mode()

v4:
* Fix "ata_" prefix (Noticed by Jeff).

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-lib.c      |  7 +------
 drivers/ide/pci/sl82c105.c | 10 +++++++---
 include/linux/ide.h        |  6 +++++-
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index d45bbad9ffe7..d5cc96bb4298 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -267,18 +267,15 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 {
 	int pio_mode;
 	int cycle_time = 0;
-	int use_iordy = 0;
 	struct hd_driveid* id = drive->id;
 	int overridden  = 0;
 
 	if (mode_wanted != 255) {
 		pio_mode = mode_wanted;
-		use_iordy = (pio_mode > 2);
 	} else if (!drive->id) {
 		pio_mode = 0;
 	} else if ((pio_mode = ide_scan_pio_blacklist(id->model)) != -1) {
 		printk(KERN_INFO "%s: is on PIO blacklist\n", drive->name);
-		use_iordy = (pio_mode > 2);
 	} else {
 		pio_mode = id->tPIO;
 		if (pio_mode > 2) {	/* 2 is maximum allowed tPIO value */
@@ -286,8 +283,7 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 			overridden = 1;
 		}
 		if (id->field_valid & 2) {	  /* drive implements ATA2? */
-			if (id->capability & 8) { /* drive supports use_iordy? */
-				use_iordy = 1;
+			if (id->capability & 8) { /* IORDY supported? */
 				cycle_time = id->eide_pio_iordy;
 				if (id->eide_pio_modes & 7) {
 					overridden = 0;
@@ -325,7 +321,6 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 	if (d) {
 		d->pio_mode = pio_mode;
 		d->cycle_time = cycle_time ? cycle_time : ide_pio_timings[pio_mode].cycle_time;
-		d->use_iordy = use_iordy;
 	}
 	return pio_mode;
 }
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index a7323d278c49..f2156ee7e2bd 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -52,9 +52,10 @@
  * Convert a PIO mode and cycle time to the required on/off times
  * for the interface.  This has protection against runaway timings.
  */
-static unsigned int get_pio_timings(ide_pio_data_t *p)
+static unsigned int get_pio_timings(ide_drive_t *drive, ide_pio_data_t *p)
 {
 	unsigned int cmd_on, cmd_off;
+	u8 iordy = 0;
 
 	cmd_on  = (ide_pio_timings[p->pio_mode].active_time + 29) / 30;
 	cmd_off = (p->cycle_time - 30 * cmd_on + 29) / 30;
@@ -65,7 +66,10 @@ static unsigned int get_pio_timings(ide_pio_data_t *p)
 	if (cmd_off == 0)
 		cmd_off = 1;
 
-	return (cmd_on - 1) << 8 | (cmd_off - 1) | (p->use_iordy ? 0x40 : 0x00);
+	if (p->pio_mode > 2 || ide_dev_has_iordy(drive->id))
+		iordy = 0x40;
+
+	return (cmd_on - 1) << 8 | (cmd_off - 1) | iordy;
 }
 
 /*
@@ -82,7 +86,7 @@ static u8 sl82c105_tune_pio(ide_drive_t *drive, u8 pio)
 
 	pio = ide_get_best_pio_mode(drive, pio, 5, &p);
 
-	drv_ctrl = get_pio_timings(&p);
+	drv_ctrl = get_pio_timings(drive, &p);
 
 	/*
 	 * Store the PIO timings so that we can restore them
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 83a117d673c7..349c22a1fbc5 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1363,6 +1363,11 @@ extern void ide_toggle_bounce(ide_drive_t *drive, int on);
 extern int ide_set_xfer_rate(ide_drive_t *drive, u8 rate);
 int ide_use_fast_pio(ide_drive_t *);
 
+static inline int ide_dev_has_iordy(struct hd_driveid *id)
+{
+	return ((id->field_valid & 2) && (id->capability & 8)) ? 1 : 0;
+}
+
 u8 ide_dump_status(ide_drive_t *, const char *, u8);
 
 typedef struct ide_pio_timings_s {
@@ -1374,7 +1379,6 @@ typedef struct ide_pio_timings_s {
 
 typedef struct ide_pio_data_s {
 	u8 pio_mode;
-	u8 use_iordy;
 	unsigned int cycle_time;
 } ide_pio_data_t;
 
-- 
cgit v1.3-8-gc7d7


From a5d8c5c834d3cabf4b7b477c3f6ee923c25026fc Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Jul 2007 01:11:55 +0200
Subject: ide: add ide_pci_device_t.host_flags (take 2)

* Rename ide_pci_device_t.flags to ide_pci_device_t.host_flags
  and IDEPCI_FLAG_ISA_PORTS flag to IDE_HFLAG_ISA_PORTS.

* Add IDE_HFLAG_SINGLE flag for single channel devices.

* Convert core code and all IDE PCI drivers to use IDE_HFLAG_SINGLE
  and remove no longer needed ide_pci_device_t.channels field.

v2:
* Fix issues noticed by Sergei:
  - correct code alignment in scc_pata.c
  - s/IDE_HFLAG_SINGLE/~IDE_HFLAG_SINGLE/ in serverworks.c

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/pci/aec62xx.c      |  5 -----
 drivers/ide/pci/alim15x3.c     |  1 -
 drivers/ide/pci/amd74xx.c      |  2 --
 drivers/ide/pci/atiixp.c       |  3 +--
 drivers/ide/pci/cmd64x.c       |  4 ----
 drivers/ide/pci/cs5520.c       |  3 +--
 drivers/ide/pci/cs5530.c       |  1 -
 drivers/ide/pci/cs5535.c       |  2 +-
 drivers/ide/pci/cy82c693.c     |  2 +-
 drivers/ide/pci/generic.c      | 15 ---------------
 drivers/ide/pci/hpt34x.c       |  1 -
 drivers/ide/pci/hpt366.c       |  8 +-------
 drivers/ide/pci/it8213.c       |  2 +-
 drivers/ide/pci/it821x.c       |  1 -
 drivers/ide/pci/jmicron.c      |  1 -
 drivers/ide/pci/ns87415.c      |  1 -
 drivers/ide/pci/opti621.c      |  2 --
 drivers/ide/pci/pdc202xx_new.c |  7 -------
 drivers/ide/pci/pdc202xx_old.c |  5 -----
 drivers/ide/pci/piix.c         |  4 +---
 drivers/ide/pci/rz1000.c       |  1 -
 drivers/ide/pci/sc1200.c       |  1 -
 drivers/ide/pci/scc_pata.c     |  2 +-
 drivers/ide/pci/serverworks.c  | 16 ++++++++--------
 drivers/ide/pci/sgiioc4.c      |  2 +-
 drivers/ide/pci/siimage.c      |  1 -
 drivers/ide/pci/sis5513.c      |  1 -
 drivers/ide/pci/sl82c105.c     |  1 -
 drivers/ide/pci/slc90e66.c     |  1 -
 drivers/ide/pci/tc86c001.c     |  4 ++--
 drivers/ide/pci/triflex.c      |  1 -
 drivers/ide/pci/trm290.c       |  1 -
 drivers/ide/pci/via82cxxx.c    |  2 --
 drivers/ide/setup-pci.c        |  9 +++------
 include/linux/ide.h            |  6 +++---
 35 files changed, 25 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index 6d18fef558bd..f8ac91c22e64 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -265,7 +265,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_aec62xx,
 		.init_chipset	= init_chipset_aec62xx,
 		.init_hwif	= init_hwif_aec62xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= OFF_BOARD,
@@ -275,7 +274,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_aec62xx,
 		.init_chipset	= init_chipset_aec62xx,
 		.init_hwif	= init_hwif_aec62xx,
-		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x1f, /* udma0-4 */
@@ -284,7 +282,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_aec62xx,
 		.init_chipset	= init_chipset_aec62xx,
 		.init_hwif	= init_hwif_aec62xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= NEVER_BOARD,
@@ -294,7 +291,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_aec6x80,
 		.init_chipset	= init_chipset_aec62xx,
 		.init_hwif	= init_hwif_aec62xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x3f, /* udma0-5 */
@@ -303,7 +299,6 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_aec6x80,
 		.init_chipset	= init_chipset_aec62xx,
 		.init_hwif	= init_hwif_aec62xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= OFF_BOARD,
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index ba0fb92b0417..1012da6e8a42 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -817,7 +817,6 @@ static ide_pci_device_t ali15x3_chipset __devinitdata = {
 	.init_chipset	= init_chipset_ali15x3,
 	.init_hwif	= init_hwif_ali15x3,
 	.init_dma	= init_dma_ali15x3,
-	.channels	= 2,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
 };
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index 8d30b99a54d8..9c3ea90aeb8b 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -448,7 +448,6 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
 		.name		= name_str,				\
 		.init_chipset	= init_chipset_amd74xx,			\
 		.init_hwif	= init_hwif_amd74xx,			\
-		.channels	= 2,					\
 		.autodma	= AUTODMA,				\
 		.enablebits	= {{0x40,0x02,0x02}, {0x40,0x01,0x01}},	\
 		.bootable	= ON_BOARD,				\
@@ -459,7 +458,6 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
 		.name		= name_str,				\
 		.init_chipset	= init_chipset_amd74xx,			\
 		.init_hwif	= init_hwif_amd74xx,			\
-		.channels	= 2,					\
 		.autodma	= AUTODMA,				\
 		.enablebits	= {{0x50,0x02,0x02}, {0x50,0x01,0x01}},	\
 		.bootable	= ON_BOARD,				\
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index bfdf086f4525..078adbe250d2 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -291,17 +291,16 @@ static ide_pci_device_t atiixp_pci_info[] __devinitdata = {
 	{	/* 0 */
 		.name		= "ATIIXP",
 		.init_hwif	= init_hwif_atiixp,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x48,0x01,0x00}, {0x48,0x08,0x00}},
 		.bootable	= ON_BOARD,
 	},{	/* 1 */
 		.name		= "SB600_PATA",
 		.init_hwif	= init_hwif_atiixp,
-		.channels	= 1,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x48,0x01,0x00}, {0x00,0x00,0x00}},
  		.bootable	= ON_BOARD,
+ 		.host_flags	= IDE_HFLAG_SINGLE,
  	},
 };
 
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 5171e94facdc..8150a023dd7a 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -617,7 +617,6 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_cmd64x,
 		.init_chipset	= init_chipset_cmd64x,
 		.init_hwif	= init_hwif_cmd64x,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x00,0x00,0x00}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
@@ -627,7 +626,6 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_cmd646,
 		.init_chipset	= init_chipset_cmd64x,
 		.init_hwif	= init_hwif_cmd64x,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
@@ -637,7 +635,6 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_cmd64x,
 		.init_chipset	= init_chipset_cmd64x,
 		.init_hwif	= init_hwif_cmd64x,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
@@ -647,7 +644,6 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_cmd64x,
 		.init_chipset	= init_chipset_cmd64x,
 		.init_hwif	= init_hwif_cmd64x,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
diff --git a/drivers/ide/pci/cs5520.c b/drivers/ide/pci/cs5520.c
index 3b88a3a56116..5539a25eae99 100644
--- a/drivers/ide/pci/cs5520.c
+++ b/drivers/ide/pci/cs5520.c
@@ -194,10 +194,9 @@ static void __devinit init_hwif_cs5520(ide_hwif_t *hwif)
 		.name		= name_str,			\
 		.init_setup_dma = cs5520_init_setup_dma,	\
 		.init_hwif	= init_hwif_cs5520,		\
-		.channels	= 2,				\
 		.autodma	= AUTODMA,			\
 		.bootable	= ON_BOARD,			\
-		.flags		= IDEPCI_FLAG_ISA_PORTS,	\
+		.host_flags	= IDE_HFLAG_ISA_PORTS,		\
 	}
 
 static ide_pci_device_t cyrix_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/cs5530.c b/drivers/ide/pci/cs5530.c
index b5c00d15a704..643d66cc6729 100644
--- a/drivers/ide/pci/cs5530.c
+++ b/drivers/ide/pci/cs5530.c
@@ -341,7 +341,6 @@ static ide_pci_device_t cs5530_chipset __devinitdata = {
 	.name		= "CS5530",
 	.init_chipset	= init_chipset_cs5530,
 	.init_hwif	= init_hwif_cs5530,
-	.channels	= 2,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
 };
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index 10f61f38243c..503979372144 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -228,9 +228,9 @@ static void __devinit init_hwif_cs5535(ide_hwif_t *hwif)
 static ide_pci_device_t cs5535_chipset __devinitdata = {
 	.name		= "CS5535",
 	.init_hwif	= init_hwif_cs5535,
-	.channels	= 1,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
+	.host_flags	= IDE_HFLAG_SINGLE,
 };
 
 static int __devinit cs5535_init_one(struct pci_dev *dev,
diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c
index 103b9db97853..995b72563613 100644
--- a/drivers/ide/pci/cy82c693.c
+++ b/drivers/ide/pci/cy82c693.c
@@ -483,9 +483,9 @@ static ide_pci_device_t cy82c693_chipset __devinitdata = {
 	.init_chipset	= init_chipset_cy82c693,
 	.init_iops	= init_iops_cy82c693,
 	.init_hwif	= init_hwif_cy82c693,
-	.channels	= 1,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
+	.host_flags	= IDE_HFLAG_SINGLE,
 };
 
 static int __devinit cy82c693_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/generic.c b/drivers/ide/pci/generic.c
index 0d51a11e81da..48caa468b762 100644
--- a/drivers/ide/pci/generic.c
+++ b/drivers/ide/pci/generic.c
@@ -95,92 +95,77 @@ static ide_pci_device_t generic_chipsets[] __devinitdata = {
 	{	/* 0 */
 		.name		= "Unknown",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 1 */
 		.name		= "NS87410",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x43,0x08,0x08}, {0x47,0x08,0x08}},
 		.bootable	= ON_BOARD,
         },{	/* 2 */
 		.name		= "SAMURAI",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 3 */
 		.name		= "HT6565",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 4 */
 		.name		= "UM8673F",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 5 */
 		.name		= "UM8886A",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 6 */
 		.name		= "UM8886BF",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 7 */
 		.name		= "HINT_IDE",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 8 */
 		.name		= "VIA_IDE",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 9 */
 		.name		= "OPTI621V",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 10 */
 		.name		= "VIA8237SATA",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 	},{	/* 11 */
 		.name 		= "Piccolo0102",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 12 */
 		.name 		= "Piccolo0103",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 13 */
 		.name 		= "Piccolo0105",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 14 */
 		.name		= "Revolution",
 		.init_hwif	= init_hwif_generic,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 	}
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
index 64f19743b127..6d2ef0ee0f2b 100644
--- a/drivers/ide/pci/hpt34x.c
+++ b/drivers/ide/pci/hpt34x.c
@@ -175,7 +175,6 @@ static ide_pci_device_t hpt34x_chipset __devinitdata = {
 	.name		= "HPT34X",
 	.init_chipset	= init_chipset_hpt34x,
 	.init_hwif	= init_hwif_hpt34x,
-	.channels	= 2,
 	.autodma	= NOAUTODMA,
 	.bootable	= NEVER_BOARD,
 	.extra		= 16
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index 81853d740aee..182346a04f36 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -1483,7 +1483,7 @@ static int __devinit init_setup_hpt366(struct pci_dev *dev, ide_pci_device_t *d)
 		 * to both functions -- really stupid design decision... :-(
 		 * Bit 4 is for the primary channel, bit 5 for the secondary.
 		 */
-		d->channels = 1;
+		d->host_flags |= IDE_HFLAG_SINGLE;
 		d->enablebits[0].mask = d->enablebits[0].val = 0x10;
 
 		d->udma_mask = HPT366_ALLOW_ATA66_3 ?
@@ -1546,7 +1546,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_hpt366,
 		.init_hwif	= init_hwif_hpt366,
 		.init_dma	= init_dma_hpt366,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.bootable	= OFF_BOARD,
@@ -1557,7 +1556,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_hpt366,
 		.init_hwif	= init_hwif_hpt366,
 		.init_dma	= init_dma_hpt366,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
@@ -1569,7 +1567,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_hpt366,
 		.init_hwif	= init_hwif_hpt366,
 		.init_dma	= init_dma_hpt366,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT302_ALLOW_ATA133_6 ? 0x7f : 0x3f,
@@ -1581,7 +1578,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_hpt366,
 		.init_hwif	= init_hwif_hpt366,
 		.init_dma	= init_dma_hpt366,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT371_ALLOW_ATA133_6 ? 0x7f : 0x3f,
@@ -1593,7 +1589,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_hpt366,
 		.init_hwif	= init_hwif_hpt366,
 		.init_dma	= init_dma_hpt366,
-		.channels	= 2,	/* 4 */
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= 0x3f,
@@ -1605,7 +1600,6 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_hpt366,
 		.init_hwif	= init_hwif_hpt366,
 		.init_dma	= init_dma_hpt366,
-		.channels	= 2,	/* 4 */
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index ff48c23e571e..684b0ec79f41 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -272,10 +272,10 @@ static void __devinit init_hwif_it8213(ide_hwif_t *hwif)
 	{						\
 		.name		= name_str,		\
 		.init_hwif	= init_hwif_it8213,	\
-		.channels	= 1,			\
 		.autodma	= AUTODMA,		\
 		.enablebits	= {{0x41,0x80,0x80}}, \
 		.bootable	= ON_BOARD,		\
+		.host_flags	= IDE_HFLAG_SINGLE,	\
 	}
 
 static ide_pci_device_t it8213_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
index 8197b653ba1e..faccb2d4af43 100644
--- a/drivers/ide/pci/it821x.c
+++ b/drivers/ide/pci/it821x.c
@@ -718,7 +718,6 @@ static unsigned int __devinit init_chipset_it821x(struct pci_dev *dev, const cha
 		.name		= name_str,		\
 		.init_chipset	= init_chipset_it821x,	\
 		.init_hwif	= init_hwif_it821x,	\
-		.channels	= 2,			\
 		.autodma	= AUTODMA,		\
 		.bootable	= ON_BOARD,		\
 		.fixup	 	= it821x_fixups		\
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index a6008f63e71e..0edb9cd45854 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -177,7 +177,6 @@ fallback:
 	{						\
 		.name		= name_str,		\
 		.init_hwif	= init_hwif_jmicron,	\
-		.channels	= 2,			\
 		.autodma	= AUTODMA,		\
 		.bootable	= ON_BOARD,		\
 		.enablebits	= { {0x40, 1, 1}, {0x40, 0x10, 0x10} }, \
diff --git a/drivers/ide/pci/ns87415.c b/drivers/ide/pci/ns87415.c
index b310c4f51077..09941f37d635 100644
--- a/drivers/ide/pci/ns87415.c
+++ b/drivers/ide/pci/ns87415.c
@@ -281,7 +281,6 @@ static ide_pci_device_t ns87415_chipset __devinitdata = {
 	.init_iops	= init_iops_ns87415,
 #endif
 	.init_hwif	= init_hwif_ns87415,
-	.channels	= 2,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
 };
diff --git a/drivers/ide/pci/opti621.c b/drivers/ide/pci/opti621.c
index aede7eee9246..78d7adf2d0b0 100644
--- a/drivers/ide/pci/opti621.c
+++ b/drivers/ide/pci/opti621.c
@@ -350,14 +350,12 @@ static ide_pci_device_t opti621_chipsets[] __devinitdata = {
 	{	/* 0 */
 		.name		= "OPTI621",
 		.init_hwif	= init_hwif_opti621,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x45,0x80,0x00}, {0x40,0x08,0x00}},
 		.bootable	= ON_BOARD,
 	},{	/* 1 */
 		.name		= "OPTI621X",
 		.init_hwif	= init_hwif_opti621,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x45,0x80,0x00}, {0x40,0x08,0x00}},
 		.bootable	= ON_BOARD,
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index 06c5e93d37a2..f8508d9cccd3 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -566,7 +566,6 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_pdcnew,
 		.init_chipset	= init_chipset_pdcnew,
 		.init_hwif	= init_hwif_pdc202new,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x3f, /* udma0-5 */
@@ -575,7 +574,6 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_pdcnew,
 		.init_chipset	= init_chipset_pdcnew,
 		.init_hwif	= init_hwif_pdc202new,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x7f, /* udma0-6*/
@@ -584,7 +582,6 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_pdc20270,
 		.init_chipset	= init_chipset_pdcnew,
 		.init_hwif	= init_hwif_pdc202new,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x3f, /* udma0-5 */
@@ -593,7 +590,6 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_pdcnew,
 		.init_chipset	= init_chipset_pdcnew,
 		.init_hwif	= init_hwif_pdc202new,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x7f, /* udma0-6*/
@@ -602,7 +598,6 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_pdcnew,
 		.init_chipset	= init_chipset_pdcnew,
 		.init_hwif	= init_hwif_pdc202new,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x7f, /* udma0-6*/
@@ -611,7 +606,6 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_pdc20276,
 		.init_chipset	= init_chipset_pdcnew,
 		.init_hwif	= init_hwif_pdc202new,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x7f, /* udma0-6*/
@@ -620,7 +614,6 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_pdcnew,
 		.init_chipset	= init_chipset_pdcnew,
 		.init_hwif	= init_hwif_pdc202new,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.udma_mask	= 0x7f, /* udma0-6*/
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index 25cebfe18ddd..eb4e4a61d06e 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -441,7 +441,6 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_pdc202xx,
 		.init_hwif	= init_hwif_pdc202xx,
 		.init_dma	= init_dma_pdc202xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 16,
@@ -452,7 +451,6 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_pdc202xx,
 		.init_hwif	= init_hwif_pdc202xx,
 		.init_dma	= init_dma_pdc202xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
@@ -463,7 +461,6 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_pdc202xx,
 		.init_hwif	= init_hwif_pdc202xx,
 		.init_dma	= init_dma_pdc202xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
@@ -474,7 +471,6 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_pdc202xx,
 		.init_hwif	= init_hwif_pdc202xx,
 		.init_dma	= init_dma_pdc202xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
@@ -485,7 +481,6 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.init_chipset	= init_chipset_pdc202xx,
 		.init_hwif	= init_hwif_pdc202xx,
 		.init_dma	= init_dma_pdc202xx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 1372c35be035..a4f88d25b16d 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -495,7 +495,6 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif)
 		.name		= name_str,		\
 		.init_chipset	= init_chipset_piix,	\
 		.init_hwif	= init_hwif_piix,	\
-		.channels	= 2,			\
 		.autodma	= AUTODMA,		\
 		.enablebits	= {{0x41,0x80,0x80}, {0x43,0x80,0x80}}, \
 		.bootable	= ON_BOARD,		\
@@ -514,11 +513,10 @@ static ide_pci_device_t piix_pci_info[] __devinitdata = {
 		 */
 		.name		= "MPIIX",
 		.init_hwif	= init_hwif_piix,
-		.channels	= 2,
 		.autodma	= NODMA,
 		.enablebits	= {{0x6d,0xc0,0x80}, {0x6d,0xc0,0xc0}},
 		.bootable	= ON_BOARD,
-		.flags		= IDEPCI_FLAG_ISA_PORTS
+		.host_flags	= IDE_HFLAG_ISA_PORTS,
 	},
 
 	/*  3 */ DECLARE_PIIX_DEV("PIIX3", 0x00),	/* no udma */
diff --git a/drivers/ide/pci/rz1000.c b/drivers/ide/pci/rz1000.c
index f8c954690142..10e1ae7a4a02 100644
--- a/drivers/ide/pci/rz1000.c
+++ b/drivers/ide/pci/rz1000.c
@@ -52,7 +52,6 @@ static void __devinit init_hwif_rz1000 (ide_hwif_t *hwif)
 static ide_pci_device_t rz1000_chipset __devinitdata = {
 	.name		= "RZ100x",
 	.init_hwif	= init_hwif_rz1000,
-	.channels	= 2,
 	.autodma	= NODMA,
 	.bootable	= ON_BOARD,
 };
diff --git a/drivers/ide/pci/sc1200.c b/drivers/ide/pci/sc1200.c
index 523363c93794..7dbc44cc722a 100644
--- a/drivers/ide/pci/sc1200.c
+++ b/drivers/ide/pci/sc1200.c
@@ -471,7 +471,6 @@ static void __devinit init_hwif_sc1200 (ide_hwif_t *hwif)
 static ide_pci_device_t sc1200_chipset __devinitdata = {
 	.name		= "SC1200",
 	.init_hwif	= init_hwif_sc1200,
-	.channels	= 2,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
 };
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index fd4b1a24ecab..a1954d1c2879 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -772,9 +772,9 @@ static void __devinit init_hwif_scc(ide_hwif_t *hwif)
       .init_setup	= init_setup_scc,		\
       .init_iops	= init_iops_scc,		\
       .init_hwif	= init_hwif_scc,		\
-      .channels	= 1,					\
       .autodma	= AUTODMA,				\
       .bootable	= ON_BOARD,				\
+      .host_flags	= IDE_HFLAG_SINGLE,		\
   }
 
 static ide_pci_device_t scc_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index ed04e0c8dd4c..809deb92bc11 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -441,9 +441,12 @@ static int __devinit init_setup_csb6 (struct pci_dev *dev, ide_pci_device_t *d)
 			d->bootable = ON_BOARD;
 	}
 
-	d->channels = ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE ||
-			dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2) &&
-		       (!(PCI_FUNC(dev->devfn) & 1))) ? 1 : 2;
+	if ((dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE ||
+	    dev->device == PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2) &&
+	    (!(PCI_FUNC(dev->devfn) & 1)))
+		d->host_flags |= IDE_HFLAG_SINGLE;
+	else
+		d->host_flags &= ~IDE_HFLAG_SINGLE;
 
 	return ide_setup_pci_device(dev, d);
 }
@@ -454,7 +457,6 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_svwks,
 		.init_chipset	= init_chipset_svwks,
 		.init_hwif	= init_hwif_svwks,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 1 */
@@ -462,7 +464,6 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_svwks,
 		.init_chipset	= init_chipset_svwks,
 		.init_hwif	= init_hwif_svwks,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 2 */
@@ -470,7 +471,6 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_csb6,
 		.init_chipset	= init_chipset_svwks,
 		.init_hwif	= init_hwif_svwks,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 	},{	/* 3 */
@@ -478,17 +478,17 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.init_setup	= init_setup_csb6,
 		.init_chipset	= init_chipset_svwks,
 		.init_hwif	= init_hwif_svwks,
-		.channels	= 1,	/* 2 */
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
+		.host_flags	= IDE_HFLAG_SINGLE,
 	},{	/* 4 */
 		.name		= "SvrWks HT1000",
 		.init_setup	= init_setup_svwks,
 		.init_chipset	= init_chipset_svwks,
 		.init_hwif	= init_hwif_svwks,
-		.channels	= 1,	/* 2 */
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
+		.host_flags	= IDE_HFLAG_SINGLE,
 	}
 };
 
diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index d396b2929ed8..ed983b56b4e2 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -724,10 +724,10 @@ static ide_pci_device_t sgiioc4_chipset __devinitdata = {
 	 .name = "SGIIOC4",
 	 .init_hwif = ide_init_sgiioc4,
 	 .init_dma = ide_dma_sgiioc4,
-	 .channels = 1,
 	 .autodma = AUTODMA,
 	 /* SGI IOC4 doesn't have enablebits. */
 	 .bootable = ON_BOARD,
+	 .host_flags = IDE_HFLAG_SINGLE,
 };
 
 int
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index 5304fc5c616a..47b9b94afe05 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -957,7 +957,6 @@ static void __devinit init_hwif_siimage(ide_hwif_t *hwif)
 		.init_iops	= init_iops_siimage,	\
 		.init_hwif	= init_hwif_siimage,	\
 		.fixup		= siimage_fixup,	\
-		.channels	= 2,			\
 		.autodma	= AUTODMA,		\
 		.bootable	= ON_BOARD,		\
 	}
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 756a9b6eb462..bbc502087578 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -878,7 +878,6 @@ static ide_pci_device_t sis5513_chipset __devinitdata = {
 	.name		= "SIS5513",
 	.init_chipset	= init_chipset_sis5513,
 	.init_hwif	= init_hwif_sis5513,
-	.channels	= 2,
 	.autodma	= NOAUTODMA,
 	.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 	.bootable	= ON_BOARD,
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index f2156ee7e2bd..f4637d29aaf5 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -453,7 +453,6 @@ static ide_pci_device_t sl82c105_chipset __devinitdata = {
 	.name		= "W82C105",
 	.init_chipset	= init_chipset_sl82c105,
 	.init_hwif	= init_hwif_sl82c105,
-	.channels	= 2,
 	.autodma	= NOAUTODMA,
 	.enablebits	= {{0x40,0x01,0x01}, {0x40,0x10,0x10}},
 	.bootable	= ON_BOARD,
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index 575dbbd8b482..115bcaefc8ee 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -214,7 +214,6 @@ static void __devinit init_hwif_slc90e66 (ide_hwif_t *hwif)
 static ide_pci_device_t slc90e66_chipset __devinitdata = {
 	.name		= "SLC90E66",
 	.init_hwif	= init_hwif_slc90e66,
-	.channels	= 2,
 	.autodma	= AUTODMA,
 	.enablebits	= {{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 	.bootable	= ON_BOARD,
diff --git a/drivers/ide/pci/tc86c001.c b/drivers/ide/pci/tc86c001.c
index 8de1f8e22494..1d40b0820f50 100644
--- a/drivers/ide/pci/tc86c001.c
+++ b/drivers/ide/pci/tc86c001.c
@@ -248,9 +248,9 @@ static ide_pci_device_t tc86c001_chipset __devinitdata = {
 	.name		= "TC86C001",
 	.init_chipset	= init_chipset_tc86c001,
 	.init_hwif	= init_hwif_tc86c001,
-	.channels	= 1,
 	.autodma	= AUTODMA,
-	.bootable	= OFF_BOARD
+	.bootable	= OFF_BOARD,
+	.host_flags	= IDE_HFLAG_SINGLE,
 };
 
 static int __devinit tc86c001_init_one(struct pci_dev *dev,
diff --git a/drivers/ide/pci/triflex.c b/drivers/ide/pci/triflex.c
index 35e8c612638f..801314fe3524 100644
--- a/drivers/ide/pci/triflex.c
+++ b/drivers/ide/pci/triflex.c
@@ -129,7 +129,6 @@ static void __devinit init_hwif_triflex(ide_hwif_t *hwif)
 static ide_pci_device_t triflex_device __devinitdata = {
 	.name		= "TRIFLEX",
 	.init_hwif	= init_hwif_triflex,
-	.channels	= 2,
 	.autodma	= AUTODMA,
 	.enablebits	= {{0x80, 0x01, 0x01}, {0x80, 0x02, 0x02}},
 	.bootable	= ON_BOARD,
diff --git a/drivers/ide/pci/trm290.c b/drivers/ide/pci/trm290.c
index cbb1b11119a5..dc4f4e298e00 100644
--- a/drivers/ide/pci/trm290.c
+++ b/drivers/ide/pci/trm290.c
@@ -327,7 +327,6 @@ static void __devinit init_hwif_trm290(ide_hwif_t *hwif)
 static ide_pci_device_t trm290_chipset __devinitdata = {
 	.name		= "TRM290",
 	.init_hwif	= init_hwif_trm290,
-	.channels	= 2,
 	.autodma	= NOAUTODMA,
 	.bootable	= ON_BOARD,
 };
diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index 27e92fb9f95e..bfc9b67f8c92 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -498,7 +498,6 @@ static ide_pci_device_t via82cxxx_chipsets[] __devinitdata = {
 		.name		= "VP_IDE",
 		.init_chipset	= init_chipset_via82cxxx,
 		.init_hwif	= init_hwif_via82cxxx,
-		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.enablebits	= {{0x40,0x02,0x02}, {0x40,0x01,0x01}},
 		.bootable	= ON_BOARD
@@ -506,7 +505,6 @@ static ide_pci_device_t via82cxxx_chipsets[] __devinitdata = {
 		.name		= "VP_IDE",
 		.init_chipset	= init_chipset_via82cxxx,
 		.init_hwif	= init_hwif_via82cxxx,
-		.channels	= 2,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		.bootable	= ON_BOARD,
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index c88d33225cf9..bfe1f4e59597 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -407,7 +407,7 @@ static ide_hwif_t *ide_hwif_configure(struct pci_dev *dev, ide_pci_device_t *d,
 	unsigned long ctl = 0, base = 0;
 	ide_hwif_t *hwif;
 
-	if ((d->flags & IDEPCI_FLAG_ISA_PORTS) == 0) {
+	if ((d->host_flags & IDE_HFLAG_ISA_PORTS) == 0) {
 		/*  Possibly we should fail if these checks report true */
 		ide_pci_check_iomem(dev, d, 2*port);
 		ide_pci_check_iomem(dev, d, 2*port+1);
@@ -571,7 +571,7 @@ out:
  
 void ide_pci_setup_ports(struct pci_dev *dev, ide_pci_device_t *d, int pciirq, ata_index_t *index)
 {
-	int port;
+	int channels = (d->host_flags & IDE_HFLAG_SINGLE) ? 1 : 2, port;
 	int at_least_one_hwif_enabled = 0;
 	ide_hwif_t *hwif, *mate = NULL;
 	u8 tmp;
@@ -582,16 +582,13 @@ void ide_pci_setup_ports(struct pci_dev *dev, ide_pci_device_t *d, int pciirq, a
 	 * Set up the IDE ports
 	 */
 	 
-	for (port = 0; port <= 1; ++port) {
+	for (port = 0; port < channels; ++port) {
 		ide_pci_enablebit_t *e = &(d->enablebits[port]);
 	
 		if (e->reg && (pci_read_config_byte(dev, e->reg, &tmp) ||
 		    (tmp & e->mask) != e->val))
 			continue;	/* port not enabled */
 
-		if (d->channels	<= port)
-			break;
-	
 		if ((hwif = ide_hwif_configure(dev, d, mate, port, pciirq)) == NULL)
 			continue;
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 349c22a1fbc5..498dc57627fa 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1244,7 +1244,8 @@ typedef struct ide_pci_enablebit_s {
 
 enum {
 	/* Uses ISA control ports not PCI ones. */
-	IDEPCI_FLAG_ISA_PORTS		= (1 << 0),
+	IDE_HFLAG_ISA_PORTS		= (1 << 0),
+	IDE_HFLAG_SINGLE		= (1 << 1),
 };
 
 typedef struct ide_pci_device_s {
@@ -1256,13 +1257,12 @@ typedef struct ide_pci_device_s {
 	void                    (*init_hwif)(ide_hwif_t *);
 	void			(*init_dma)(ide_hwif_t *, unsigned long);
 	void			(*fixup)(ide_hwif_t *);
-	u8			channels;
 	u8			autodma;
 	ide_pci_enablebit_t	enablebits[2];
 	u8			bootable;
 	unsigned int		extra;
 	struct ide_pci_device_s	*next;
-	u8			flags;
+	u8			host_flags;
 	u8			udma_mask;
 } ide_pci_device_t;
 
-- 
cgit v1.3-8-gc7d7


From 7dd00083b1160b560fa2a0a486799b57baa5d035 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Jul 2007 01:11:56 +0200
Subject: ide: add ide_pio_cycle_time() helper (take 2)

* Add ide_pio_cycle_time() helper.

* Use it in ali14xx/ht6560b/qd65xx/cmd64{0,x}/sl82c105 and pmac host drivers
  (previously cycle time given by the device was only used for "pio" == 255).

* Remove no longer needed ide_pio_data_t.cycle_time field.

v2:
* Fix "ata_" prefix (Noticed by Jeff).

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-lib.c        | 40 ++++++++++++++++++++++++++++------------
 drivers/ide/legacy/ali14xx.c |  5 ++---
 drivers/ide/legacy/ht6560b.c | 10 ++++++----
 drivers/ide/legacy/qd65xx.c  | 20 ++++++++++----------
 drivers/ide/pci/cmd640.c     | 12 +++++-------
 drivers/ide/pci/cmd64x.c     | 10 ++++++----
 drivers/ide/pci/sl82c105.c   | 16 ++++++++--------
 drivers/ide/ppc/pmac.c       | 15 ++++++++-------
 include/linux/ide.h          |  2 +-
 9 files changed, 74 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index d5cc96bb4298..6e85bee0bef7 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -249,6 +249,29 @@ static int ide_scan_pio_blacklist (char *model)
 	return -1;
 }
 
+unsigned int ide_pio_cycle_time(ide_drive_t *drive, u8 pio)
+{
+	struct hd_driveid *id = drive->id;
+	int cycle_time = 0;
+
+	if (id->field_valid & 2) {
+		if (id->capability & 8)
+			cycle_time = id->eide_pio_iordy;
+		else
+			cycle_time = id->eide_pio;
+	}
+
+	/* conservative "downgrade" for all pre-ATA2 drives */
+	if (pio < 3) {
+		if (cycle_time && cycle_time < ide_pio_timings[pio].cycle_time)
+			cycle_time = 0; /* use standard timing */
+	}
+
+	return cycle_time ? cycle_time : ide_pio_timings[pio].cycle_time;
+}
+
+EXPORT_SYMBOL_GPL(ide_pio_cycle_time);
+
 /**
  *	ide_get_best_pio_mode	-	get PIO mode from drive
  *	@drive: drive to consider
@@ -266,7 +289,6 @@ static int ide_scan_pio_blacklist (char *model)
 u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_pio_data_t *d)
 {
 	int pio_mode;
-	int cycle_time = 0;
 	struct hd_driveid* id = drive->id;
 	int overridden  = 0;
 
@@ -284,7 +306,6 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 		}
 		if (id->field_valid & 2) {	  /* drive implements ATA2? */
 			if (id->capability & 8) { /* IORDY supported? */
-				cycle_time = id->eide_pio_iordy;
 				if (id->eide_pio_modes & 7) {
 					overridden = 0;
 					if (id->eide_pio_modes & 4)
@@ -294,8 +315,6 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 					else
 						pio_mode = 3;
 				}
-			} else {
-				cycle_time = id->eide_pio;
 			}
 		}
 
@@ -310,18 +329,15 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 			pio_mode--;
 			printk(KERN_INFO "%s: applying conservative "
 					 "PIO \"downgrade\"\n", drive->name);
-			if (cycle_time && cycle_time < ide_pio_timings[pio_mode].cycle_time)
-				cycle_time = 0; /* use standard timing */
 		}
 	}
-	if (pio_mode > max_mode) {
+
+	if (pio_mode > max_mode)
 		pio_mode = max_mode;
-		cycle_time = 0;
-	}
-	if (d) {
+
+	if (d)
 		d->pio_mode = pio_mode;
-		d->cycle_time = cycle_time ? cycle_time : ide_pio_timings[pio_mode].cycle_time;
-	}
+
 	return pio_mode;
 }
 
diff --git a/drivers/ide/legacy/ali14xx.c b/drivers/ide/legacy/ali14xx.c
index df17ed68c0bc..30aeb8750c00 100644
--- a/drivers/ide/legacy/ali14xx.c
+++ b/drivers/ide/legacy/ali14xx.c
@@ -115,13 +115,12 @@ static void ali14xx_tune_drive (ide_drive_t *drive, u8 pio)
 	int time1, time2;
 	u8 param1, param2, param3, param4;
 	unsigned long flags;
-	ide_pio_data_t d;
 	int bus_speed = system_bus_clock();
 
-	pio = ide_get_best_pio_mode(drive, pio, ALI_MAX_PIO, &d);
+	pio = ide_get_best_pio_mode(drive, pio, ALI_MAX_PIO, NULL);
 
 	/* calculate timing, according to PIO mode */
-	time1 = d.cycle_time;
+	time1 = ide_pio_cycle_time(drive, pio);
 	time2 = ide_pio_timings[pio].active_time;
 	param3 = param1 = (time2 * bus_speed + 999) / 1000;
 	param4 = param2 = (time1 * bus_speed + 999) / 1000 - param1;
diff --git a/drivers/ide/legacy/ht6560b.c b/drivers/ide/legacy/ht6560b.c
index c8f353b1296f..85d16812d902 100644
--- a/drivers/ide/legacy/ht6560b.c
+++ b/drivers/ide/legacy/ht6560b.c
@@ -203,19 +203,21 @@ static u8 ht_pio2timings(ide_drive_t *drive, u8 pio)
 {
 	int active_time, recovery_time;
 	int active_cycles, recovery_cycles;
-	ide_pio_data_t d;
 	int bus_speed = system_bus_clock();
 	
         if (pio) {
-		pio = ide_get_best_pio_mode(drive, pio, 5, &d);
-		
+		unsigned int cycle_time;
+
+		pio = ide_get_best_pio_mode(drive, pio, 5, NULL);
+		cycle_time = ide_pio_cycle_time(drive, pio);
+
 		/*
 		 *  Just like opti621.c we try to calculate the
 		 *  actual cycle time for recovery and activity
 		 *  according system bus speed.
 		 */
 		active_time = ide_pio_timings[pio].active_time;
-		recovery_time = d.cycle_time 
+		recovery_time = cycle_time
 			- active_time
 			- ide_pio_timings[pio].setup_time;
 		/*
diff --git a/drivers/ide/legacy/qd65xx.c b/drivers/ide/legacy/qd65xx.c
index 7783745dd167..233905b08e93 100644
--- a/drivers/ide/legacy/qd65xx.c
+++ b/drivers/ide/legacy/qd65xx.c
@@ -252,38 +252,38 @@ static void qd6500_tune_drive (ide_drive_t *drive, u8 pio)
 
 static void qd6580_tune_drive (ide_drive_t *drive, u8 pio)
 {
-	ide_pio_data_t d;
 	int base = HWIF(drive)->select_data;
+	unsigned int cycle_time;
 	int active_time   = 175;
 	int recovery_time = 415; /* worst case values from the dos driver */
 
 	if (drive->id && !qd_find_disk_type(drive, &active_time, &recovery_time)) {
-		pio = ide_get_best_pio_mode(drive, pio, 4, &d);
+		pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+		cycle_time = ide_pio_cycle_time(drive, pio);
 
 		switch (pio) {
 			case 0: break;
 			case 3:
-				if (d.cycle_time >= 110) {
+				if (cycle_time >= 110) {
 					active_time = 86;
-					recovery_time = d.cycle_time - 102;
+					recovery_time = cycle_time - 102;
 				} else
 					printk(KERN_WARNING "%s: Strange recovery time !\n",drive->name);
 				break;
 			case 4:
-				if (d.cycle_time >= 69) {
+				if (cycle_time >= 69) {
 					active_time = 70;
-					recovery_time = d.cycle_time - 61;
+					recovery_time = cycle_time - 61;
 				} else
 					printk(KERN_WARNING "%s: Strange recovery time !\n",drive->name);
 				break;
 			default:
-				if (d.cycle_time >= 180) {
+				if (cycle_time >= 180) {
 					active_time = 110;
-					recovery_time = d.cycle_time - 120;
+					recovery_time = cycle_time - 120;
 				} else {
 					active_time = ide_pio_timings[pio].active_time;
-					recovery_time = d.cycle_time
-							-active_time;
+					recovery_time = cycle_time - active_time;
 				}
 		}
 		printk(KERN_INFO "%s: PIO mode%d\n", drive->name,pio);
diff --git a/drivers/ide/pci/cmd640.c b/drivers/ide/pci/cmd640.c
index dff21597e643..9f67ffc43421 100644
--- a/drivers/ide/pci/cmd640.c
+++ b/drivers/ide/pci/cmd640.c
@@ -633,9 +633,8 @@ static void cmd640_set_mode (unsigned int index, u8 pio_mode, unsigned int cycle
  */
 static void cmd640_tune_drive (ide_drive_t *drive, u8 mode_wanted)
 {
+	unsigned int index = 0, cycle_time;
 	u8 b;
-	ide_pio_data_t  d;
-	unsigned int index = 0;
 
 	while (drive != cmd_drives[index]) {
 		if (++index > 3) {
@@ -662,13 +661,12 @@ static void cmd640_tune_drive (ide_drive_t *drive, u8 mode_wanted)
 			return;
 	}
 
-	(void) ide_get_best_pio_mode (drive, mode_wanted, 5, &d);
-	cmd640_set_mode (index, d.pio_mode, d.cycle_time);
+	mode_wanted = ide_get_best_pio_mode(drive, mode_wanted, 5, NULL);
+	cycle_time = ide_pio_cycle_time(drive, mode_wanted);
+	cmd640_set_mode(index, mode_wanted, cycle_time);
 
 	printk("%s: selected cmd640 PIO mode%d (%dns)",
-		drive->name,
-		d.pio_mode,
-		d.cycle_time);
+		drive->name, mode_wanted, cycle_time);
 
 	display_clocks(index);
 }
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 8150a023dd7a..c383f6dc67dd 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -221,16 +221,18 @@ static u8 cmd64x_tune_pio (ide_drive_t *drive, u8 mode_wanted)
 {
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
-	ide_pio_data_t pio;
+	unsigned int cycle_time;
 	u8 pio_mode, setup_count, arttim = 0;
 	static const u8 setup_values[] = {0x40, 0x40, 0x40, 0x80, 0, 0xc0};
 	static const u8 arttim_regs[4] = {ARTTIM0, ARTTIM1, ARTTIM23, ARTTIM23};
-	pio_mode = ide_get_best_pio_mode(drive, mode_wanted, 5, &pio);
+
+	pio_mode = ide_get_best_pio_mode(drive, mode_wanted, 5, NULL);
+	cycle_time = ide_pio_cycle_time(drive, pio_mode);
 
 	cmdprintk("%s: PIO mode wanted %d, selected %d (%d ns)\n",
-		  drive->name, mode_wanted, pio_mode, pio.cycle_time);
+		  drive->name, mode_wanted, pio_mode, cycle_time);
 
-	program_cycle_times(drive, pio.cycle_time,
+	program_cycle_times(drive, cycle_time,
 			    ide_pio_timings[pio_mode].active_time);
 
 	setup_count = quantize_timing(ide_pio_timings[pio_mode].setup_time,
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index f4637d29aaf5..cc5f69b7514f 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -52,13 +52,13 @@
  * Convert a PIO mode and cycle time to the required on/off times
  * for the interface.  This has protection against runaway timings.
  */
-static unsigned int get_pio_timings(ide_drive_t *drive, ide_pio_data_t *p)
+static unsigned int get_pio_timings(ide_drive_t *drive, u8 pio)
 {
 	unsigned int cmd_on, cmd_off;
 	u8 iordy = 0;
 
-	cmd_on  = (ide_pio_timings[p->pio_mode].active_time + 29) / 30;
-	cmd_off = (p->cycle_time - 30 * cmd_on + 29) / 30;
+	cmd_on  = (ide_pio_timings[pio].active_time + 29) / 30;
+	cmd_off = (ide_pio_cycle_time(drive, pio) - 30 * cmd_on + 29) / 30;
 
 	if (cmd_on == 0)
 		cmd_on = 1;
@@ -66,7 +66,7 @@ static unsigned int get_pio_timings(ide_drive_t *drive, ide_pio_data_t *p)
 	if (cmd_off == 0)
 		cmd_off = 1;
 
-	if (p->pio_mode > 2 || ide_dev_has_iordy(drive->id))
+	if (pio > 2 || ide_dev_has_iordy(drive->id))
 		iordy = 0x40;
 
 	return (cmd_on - 1) << 8 | (cmd_off - 1) | iordy;
@@ -79,14 +79,13 @@ static u8 sl82c105_tune_pio(ide_drive_t *drive, u8 pio)
 {
 	struct pci_dev *dev	= HWIF(drive)->pci_dev;
 	int reg			= 0x44 + drive->dn * 4;
-	ide_pio_data_t p;
 	u16 drv_ctrl;
 
 	DBG(("sl82c105_tune_pio(drive:%s, pio:%u)\n", drive->name, pio));
 
-	pio = ide_get_best_pio_mode(drive, pio, 5, &p);
+	pio = ide_get_best_pio_mode(drive, pio, 5, NULL);
 
-	drv_ctrl = get_pio_timings(drive, &p);
+	drv_ctrl = get_pio_timings(drive, pio);
 
 	/*
 	 * Store the PIO timings so that we can restore them
@@ -105,7 +104,8 @@ static u8 sl82c105_tune_pio(ide_drive_t *drive, u8 pio)
 	}
 
 	printk(KERN_DEBUG "%s: selected %s (%dns) (%04X)\n", drive->name,
-	       ide_xfer_verbose(pio + XFER_PIO_0), p.cycle_time, drv_ctrl);
+			  ide_xfer_verbose(pio + XFER_PIO_0),
+			  ide_pio_cycle_time(drive, pio), drv_ctrl);
 
 	return pio;
 }
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index e46f47206542..669330521fc1 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -615,24 +615,25 @@ out:
 static void
 pmac_ide_tuneproc(ide_drive_t *drive, u8 pio)
 {
-	ide_pio_data_t d;
 	u32 *timings;
 	unsigned accessTicks, recTicks;
 	unsigned accessTime, recTime;
 	pmac_ide_hwif_t* pmif = (pmac_ide_hwif_t *)HWIF(drive)->hwif_data;
-	
+	unsigned int cycle_time;
+
 	if (pmif == NULL)
 		return;
 		
 	/* which drive is it ? */
 	timings = &pmif->timings[drive->select.b.unit & 0x01];
 
-	pio = ide_get_best_pio_mode(drive, pio, 4, &d);
+	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	cycle_time = ide_pio_cycle_time(drive, pio);
 
 	switch (pmif->kind) {
 	case controller_sh_ata6: {
 		/* 133Mhz cell */
-		u32 tr = kauai_lookup_timing(shasta_pio_timings, d.cycle_time);
+		u32 tr = kauai_lookup_timing(shasta_pio_timings, cycle_time);
 		if (tr == 0)
 			return;
 		*timings = ((*timings) & ~TR_133_PIOREG_PIO_MASK) | tr;
@@ -641,7 +642,7 @@ pmac_ide_tuneproc(ide_drive_t *drive, u8 pio)
 	case controller_un_ata6:
 	case controller_k2_ata6: {
 		/* 100Mhz cell */
-		u32 tr = kauai_lookup_timing(kauai_pio_timings, d.cycle_time);
+		u32 tr = kauai_lookup_timing(kauai_pio_timings, cycle_time);
 		if (tr == 0)
 			return;
 		*timings = ((*timings) & ~TR_100_PIOREG_PIO_MASK) | tr;
@@ -649,7 +650,7 @@ pmac_ide_tuneproc(ide_drive_t *drive, u8 pio)
 		}
 	case controller_kl_ata4:
 		/* 66Mhz cell */
-		recTime = d.cycle_time - ide_pio_timings[pio].active_time
+		recTime = cycle_time - ide_pio_timings[pio].active_time
 				- ide_pio_timings[pio].setup_time;
 		recTime = max(recTime, 150U);
 		accessTime = ide_pio_timings[pio].active_time;
@@ -665,7 +666,7 @@ pmac_ide_tuneproc(ide_drive_t *drive, u8 pio)
 	default: {
 		/* 33Mhz cell */
 		int ebit = 0;
-		recTime = d.cycle_time - ide_pio_timings[pio].active_time
+		recTime = cycle_time - ide_pio_timings[pio].active_time
 				- ide_pio_timings[pio].setup_time;
 		recTime = max(recTime, 150U);
 		accessTime = ide_pio_timings[pio].active_time;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 498dc57627fa..0afa52c14ffa 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1379,9 +1379,9 @@ typedef struct ide_pio_timings_s {
 
 typedef struct ide_pio_data_s {
 	u8 pio_mode;
-	unsigned int cycle_time;
 } ide_pio_data_t;
 
+unsigned int ide_pio_cycle_time(ide_drive_t *, u8);
 extern u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_pio_data_t *d);
 extern const ide_pio_timings_t ide_pio_timings[6];
 
-- 
cgit v1.3-8-gc7d7


From 2134758d2a5429325cee4d4ce8959af5314eeba1 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Jul 2007 01:11:58 +0200
Subject: ide: drop "PIO data" argument from ide_get_best_pio_mode()

* Drop no longer needed "PIO data" argument from ide_get_best_pio_mode()
  and convert all users accordingly.

* Remove no longer needed ide_pio_data_t.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-lib.c          | 6 +-----
 drivers/ide/legacy/ali14xx.c   | 2 +-
 drivers/ide/legacy/dtc2278.c   | 2 +-
 drivers/ide/legacy/ht6560b.c   | 2 +-
 drivers/ide/legacy/qd65xx.c    | 2 +-
 drivers/ide/legacy/umc8672.c   | 2 +-
 drivers/ide/mips/au1xxx-ide.c  | 2 +-
 drivers/ide/pci/aec62xx.c      | 2 +-
 drivers/ide/pci/alim15x3.c     | 3 +--
 drivers/ide/pci/atiixp.c       | 2 +-
 drivers/ide/pci/cmd640.c       | 2 +-
 drivers/ide/pci/cmd64x.c       | 2 +-
 drivers/ide/pci/cs5520.c       | 2 +-
 drivers/ide/pci/cs5530.c       | 2 +-
 drivers/ide/pci/cs5535.c       | 6 +++---
 drivers/ide/pci/cy82c693.c     | 2 +-
 drivers/ide/pci/hpt34x.c       | 2 +-
 drivers/ide/pci/hpt366.c       | 2 +-
 drivers/ide/pci/it8213.c       | 4 ++--
 drivers/ide/pci/it821x.c       | 4 ++--
 drivers/ide/pci/jmicron.c      | 2 +-
 drivers/ide/pci/opti621.c      | 4 ++--
 drivers/ide/pci/pdc202xx_new.c | 2 +-
 drivers/ide/pci/pdc202xx_old.c | 2 +-
 drivers/ide/pci/piix.c         | 2 +-
 drivers/ide/pci/sc1200.c       | 2 +-
 drivers/ide/pci/scc_pata.c     | 2 +-
 drivers/ide/pci/serverworks.c  | 2 +-
 drivers/ide/pci/siimage.c      | 4 ++--
 drivers/ide/pci/sis5513.c      | 2 +-
 drivers/ide/pci/sl82c105.c     | 2 +-
 drivers/ide/pci/slc90e66.c     | 2 +-
 drivers/ide/pci/tc86c001.c     | 2 +-
 drivers/ide/pci/triflex.c      | 2 +-
 drivers/ide/ppc/mpc8xx.c       | 3 +--
 drivers/ide/ppc/pmac.c         | 2 +-
 include/linux/ide.h            | 6 +-----
 37 files changed, 43 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 6e85bee0bef7..df29868209f3 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -277,7 +277,6 @@ EXPORT_SYMBOL_GPL(ide_pio_cycle_time);
  *	@drive: drive to consider
  *	@mode_wanted: preferred mode
  *	@max_mode: highest allowed mode
- *	@d: PIO data
  *
  *	This routine returns the recommended PIO settings for a given drive,
  *	based on the drive->id information and the ide_pio_blacklist[].
@@ -286,7 +285,7 @@ EXPORT_SYMBOL_GPL(ide_pio_cycle_time);
  *	This is used by most chipset support modules when "auto-tuning".
  */
 
-u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_pio_data_t *d)
+u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode)
 {
 	int pio_mode;
 	struct hd_driveid* id = drive->id;
@@ -335,9 +334,6 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_p
 	if (pio_mode > max_mode)
 		pio_mode = max_mode;
 
-	if (d)
-		d->pio_mode = pio_mode;
-
 	return pio_mode;
 }
 
diff --git a/drivers/ide/legacy/ali14xx.c b/drivers/ide/legacy/ali14xx.c
index 30aeb8750c00..d5c7a57b71c1 100644
--- a/drivers/ide/legacy/ali14xx.c
+++ b/drivers/ide/legacy/ali14xx.c
@@ -117,7 +117,7 @@ static void ali14xx_tune_drive (ide_drive_t *drive, u8 pio)
 	unsigned long flags;
 	int bus_speed = system_bus_clock();
 
-	pio = ide_get_best_pio_mode(drive, pio, ALI_MAX_PIO, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, ALI_MAX_PIO);
 
 	/* calculate timing, according to PIO mode */
 	time1 = ide_pio_cycle_time(drive, pio);
diff --git a/drivers/ide/legacy/dtc2278.c b/drivers/ide/legacy/dtc2278.c
index 36a3f0ac6162..8c4c27e5dc10 100644
--- a/drivers/ide/legacy/dtc2278.c
+++ b/drivers/ide/legacy/dtc2278.c
@@ -71,7 +71,7 @@ static void tune_dtc2278 (ide_drive_t *drive, u8 pio)
 {
 	unsigned long flags;
 
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 
 	if (pio >= 3) {
 		spin_lock_irqsave(&ide_lock, flags);
diff --git a/drivers/ide/legacy/ht6560b.c b/drivers/ide/legacy/ht6560b.c
index 85d16812d902..82ed37df9566 100644
--- a/drivers/ide/legacy/ht6560b.c
+++ b/drivers/ide/legacy/ht6560b.c
@@ -208,7 +208,7 @@ static u8 ht_pio2timings(ide_drive_t *drive, u8 pio)
         if (pio) {
 		unsigned int cycle_time;
 
-		pio = ide_get_best_pio_mode(drive, pio, 5, NULL);
+		pio = ide_get_best_pio_mode(drive, pio, 5);
 		cycle_time = ide_pio_cycle_time(drive, pio);
 
 		/*
diff --git a/drivers/ide/legacy/qd65xx.c b/drivers/ide/legacy/qd65xx.c
index 233905b08e93..39145102b348 100644
--- a/drivers/ide/legacy/qd65xx.c
+++ b/drivers/ide/legacy/qd65xx.c
@@ -258,7 +258,7 @@ static void qd6580_tune_drive (ide_drive_t *drive, u8 pio)
 	int recovery_time = 415; /* worst case values from the dos driver */
 
 	if (drive->id && !qd_find_disk_type(drive, &active_time, &recovery_time)) {
-		pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+		pio = ide_get_best_pio_mode(drive, pio, 4);
 		cycle_time = ide_pio_cycle_time(drive, pio);
 
 		switch (pio) {
diff --git a/drivers/ide/legacy/umc8672.c b/drivers/ide/legacy/umc8672.c
index ddc403a0bd82..caeebd41081f 100644
--- a/drivers/ide/legacy/umc8672.c
+++ b/drivers/ide/legacy/umc8672.c
@@ -110,7 +110,7 @@ static void tune_umc (ide_drive_t *drive, u8 pio)
 	unsigned long flags;
 	ide_hwgroup_t *hwgroup = ide_hwifs[HWIF(drive)->index^1].hwgroup;
 
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	printk("%s: setting umc8672 to PIO mode%d (speed %d)\n",
 		drive->name, pio, pio_to_umc[pio]);
 	spin_lock_irqsave(&ide_lock, flags);
diff --git a/drivers/ide/mips/au1xxx-ide.c b/drivers/ide/mips/au1xxx-ide.c
index 2e7013a2a7f6..b0d13c34d310 100644
--- a/drivers/ide/mips/au1xxx-ide.c
+++ b/drivers/ide/mips/au1xxx-ide.c
@@ -106,7 +106,7 @@ static void auide_tune_drive(ide_drive_t *drive, byte pio)
 	u8 speed;
 
 	/* get the best pio mode for the drive */
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 
 	printk(KERN_INFO "%s: setting Au1XXX IDE to PIO mode%d\n",
 	       drive->name, pio);
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index f8ac91c22e64..8396d711f232 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -142,7 +142,7 @@ static int aec6260_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 
 static void aec62xx_tune_drive (ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	(void) HWIF(drive)->speedproc(drive, pio + XFER_PIO_0);
 }
 
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index 1012da6e8a42..a0c43a91b0f3 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -295,7 +295,6 @@ static int ali_get_info (char *buffer, char **addr, off_t offset, int count)
  
 static u8 ali15x3_tune_pio (ide_drive_t *drive, u8 pio)
 {
-	ide_pio_data_t d;
 	ide_hwif_t *hwif = HWIF(drive);
 	struct pci_dev *dev = hwif->pci_dev;
 	int s_time, a_time, c_time;
@@ -307,7 +306,7 @@ static u8 ali15x3_tune_pio (ide_drive_t *drive, u8 pio)
 	u8 cd_dma_fifo = 0;
 	int unit = drive->select.b.unit & 1;
 
-	pio = ide_get_best_pio_mode(drive, pio, 5, &d);
+	pio = ide_get_best_pio_mode(drive, pio, 5);
 	s_time = ide_pio_timings[pio].setup_time;
 	a_time = ide_pio_timings[pio].active_time;
 	if ((s_clc = (s_time * bus_speed + 999) / 1000) >= 8)
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index 078adbe250d2..a4740e4776fd 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -155,7 +155,7 @@ static void atiixp_tune_pio(ide_drive_t *drive, u8 pio)
 
 static void atiixp_tuneproc(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	atiixp_tune_pio(drive, pio);
 	(void)ide_config_drive_speed(drive, XFER_PIO_0 + pio);
 }
diff --git a/drivers/ide/pci/cmd640.c b/drivers/ide/pci/cmd640.c
index 9f67ffc43421..335482981a92 100644
--- a/drivers/ide/pci/cmd640.c
+++ b/drivers/ide/pci/cmd640.c
@@ -661,7 +661,7 @@ static void cmd640_tune_drive (ide_drive_t *drive, u8 mode_wanted)
 			return;
 	}
 
-	mode_wanted = ide_get_best_pio_mode(drive, mode_wanted, 5, NULL);
+	mode_wanted = ide_get_best_pio_mode(drive, mode_wanted, 5);
 	cycle_time = ide_pio_cycle_time(drive, mode_wanted);
 	cmd640_set_mode(index, mode_wanted, cycle_time);
 
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index c383f6dc67dd..9cf969b61bf2 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -226,7 +226,7 @@ static u8 cmd64x_tune_pio (ide_drive_t *drive, u8 mode_wanted)
 	static const u8 setup_values[] = {0x40, 0x40, 0x40, 0x80, 0, 0xc0};
 	static const u8 arttim_regs[4] = {ARTTIM0, ARTTIM1, ARTTIM23, ARTTIM23};
 
-	pio_mode = ide_get_best_pio_mode(drive, mode_wanted, 5, NULL);
+	pio_mode = ide_get_best_pio_mode(drive, mode_wanted, 5);
 	cycle_time = ide_pio_cycle_time(drive, pio_mode);
 
 	cmdprintk("%s: PIO mode wanted %d, selected %d (%d ns)\n",
diff --git a/drivers/ide/pci/cs5520.c b/drivers/ide/pci/cs5520.c
index 5539a25eae99..ee10e48d7cef 100644
--- a/drivers/ide/pci/cs5520.c
+++ b/drivers/ide/pci/cs5520.c
@@ -126,7 +126,7 @@ static int cs5520_tune_chipset(ide_drive_t *drive, u8 xferspeed)
 	
 static void cs5520_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	cs5520_tune_chipset(drive, (XFER_PIO_0 + pio));
 }
 
diff --git a/drivers/ide/pci/cs5530.c b/drivers/ide/pci/cs5530.c
index 643d66cc6729..a75c14a93182 100644
--- a/drivers/ide/pci/cs5530.c
+++ b/drivers/ide/pci/cs5530.c
@@ -82,7 +82,7 @@ static void cs5530_tunepio(ide_drive_t *drive, u8 pio)
 
 static void cs5530_tuneproc (ide_drive_t *drive, u8 pio)	/* pio=255 means "autotune" */
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 
 	if (cs5530_set_xfer_mode(drive, XFER_PIO_0 + pio) == 0)
 		cs5530_tunepio(drive, pio);
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index 503979372144..0bff4005292a 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -89,7 +89,7 @@ static void cs5535_set_speed(ide_drive_t *drive, u8 speed)
 
 		pioa = speed - XFER_PIO_0;
 		piob = ide_get_best_pio_mode(&(drive->hwif->drives[!unit]),
-						255, 4, NULL);
+						255, 4);
 		cmd = pioa < piob ? pioa : piob;
 
 		/* Write the speed of the current drive */
@@ -159,7 +159,7 @@ static void cs5535_tuneproc(ide_drive_t *drive, u8 xferspeed)
 	/* cs5535 max pio is pio 4, best_pio will check the blacklist.
 	i think we don't need to rate_filter the incoming xferspeed
 	since we know we're only going to choose pio */
-	xferspeed = ide_get_best_pio_mode(drive, xferspeed, 4, NULL);
+	xferspeed = ide_get_best_pio_mode(drive, xferspeed, 4);
 	ide_config_drive_speed(drive, modes[xferspeed]);
 	cs5535_set_speed(drive, xferspeed);
 }
@@ -174,7 +174,7 @@ static int cs5535_dma_check(ide_drive_t *drive)
 		return 0;
 
 	if (ide_use_fast_pio(drive)) {
-		speed = ide_get_best_pio_mode(drive, 255, 4, NULL);
+		speed = ide_get_best_pio_mode(drive, 255, 4);
 		cs5535_set_drive(drive, speed);
 	}
 
diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c
index 995b72563613..cb7c18187e3c 100644
--- a/drivers/ide/pci/cy82c693.c
+++ b/drivers/ide/pci/cy82c693.c
@@ -330,7 +330,7 @@ static void cy82c693_tune_drive (ide_drive_t *drive, u8 pio)
 #endif /* CY82C693_DEBUG_LOGS */
 
 	/* first let's calc the pio modes */
-	pio = ide_get_best_pio_mode(drive, pio, CY82C693_MAX_PIO, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, CY82C693_MAX_PIO);
 
 #if CY82C693_DEBUG_INFO
 	printk (KERN_INFO "%s: Selected PIO mode %d\n", drive->name, pio);
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
index 6d2ef0ee0f2b..2e4591303f4f 100644
--- a/drivers/ide/pci/hpt34x.c
+++ b/drivers/ide/pci/hpt34x.c
@@ -80,7 +80,7 @@ static int hpt34x_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 
 static void hpt34x_tune_drive (ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 5, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 5);
 	(void) hpt34x_tune_chipset(drive, (XFER_PIO_0 + pio));
 }
 
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index 182346a04f36..0a3fe9471eb6 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -652,7 +652,7 @@ static int hpt3xx_tune_chipset(ide_drive_t *drive, u8 speed)
 
 static void hpt3xx_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	(void) hpt3xx_tune_chipset (drive, XFER_PIO_0 + pio);
 }
 
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index 684b0ec79f41..d8425a2499bf 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -82,7 +82,7 @@ static void it8213_tuneproc (ide_drive_t *drive, u8 pio)
 					{ 2, 1 },
 					{ 2, 3 }, };
 
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 
 	spin_lock_irqsave(&tune_lock, flags);
 	pci_read_config_word(dev, master_port, &master_data);
@@ -214,7 +214,7 @@ static int it8213_config_drive_for_dma (ide_drive_t *drive)
 	if (ide_tune_dma(drive))
 		return 0;
 
-	pio = ide_get_best_pio_mode(drive, 255, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, 255, 4);
 	it8213_tune_chipset(drive, XFER_PIO_0 + pio);
 
 	return -1;
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
index faccb2d4af43..790233db017b 100644
--- a/drivers/ide/pci/it821x.c
+++ b/drivers/ide/pci/it821x.c
@@ -255,7 +255,7 @@ static int it821x_tunepio(ide_drive_t *drive, u8 set_pio)
 	 * on the cable.
 	 */
 	if (pair) {
-		u8 pair_pio = ide_get_best_pio_mode(pair, 255, 4, NULL);
+		u8 pair_pio = ide_get_best_pio_mode(pair, 255, 4);
 		/* trim PIO to the slowest of the master/slave */
 		if (pair_pio < set_pio)
 			set_pio = pair_pio;
@@ -276,7 +276,7 @@ static int it821x_tunepio(ide_drive_t *drive, u8 set_pio)
 
 static void it821x_tuneproc(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	(void)it821x_tunepio(drive, pio);
 }
 
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index 0edb9cd45854..57d3d4186b3f 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -97,7 +97,7 @@ static void jmicron_tuneproc (ide_drive_t *drive, byte mode_wanted)
 
 static void config_jmicron_chipset_for_pio (ide_drive_t *drive, byte set_speed)
 {
-	u8 speed = XFER_PIO_0 + ide_get_best_pio_mode(drive, 255, 5, NULL);
+	u8 speed = XFER_PIO_0 + ide_get_best_pio_mode(drive, 255, 5);
 	if (set_speed)
 		(void) ide_config_drive_speed(drive, speed);
 }
diff --git a/drivers/ide/pci/opti621.c b/drivers/ide/pci/opti621.c
index 78d7adf2d0b0..1802ad0927eb 100644
--- a/drivers/ide/pci/opti621.c
+++ b/drivers/ide/pci/opti621.c
@@ -147,12 +147,12 @@ static void compute_pios(ide_drive_t *drive, u8 pio)
 	int d;
 	ide_hwif_t *hwif = HWIF(drive);
 
-	drive->drive_data = ide_get_best_pio_mode(drive, pio, OPTI621_MAX_PIO, NULL);
+	drive->drive_data = ide_get_best_pio_mode(drive, pio, OPTI621_MAX_PIO);
 	for (d = 0; d < 2; ++d) {
 		drive = &hwif->drives[d];
 		if (drive->present) {
 			if (drive->drive_data == PIO_DONT_KNOW)
-				drive->drive_data = ide_get_best_pio_mode(drive, 255, OPTI621_MAX_PIO, NULL);
+				drive->drive_data = ide_get_best_pio_mode(drive, 255, OPTI621_MAX_PIO);
 #ifdef OPTI621_DEBUG
 			printk("%s: Selected PIO mode %d\n",
 				drive->name, drive->drive_data);
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index f8508d9cccd3..a8a01b4ab607 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -219,7 +219,7 @@ static int pdcnew_tune_chipset(ide_drive_t *drive, u8 speed)
 
 static void pdcnew_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	(void)pdcnew_tune_chipset(drive, XFER_PIO_0 + pio);
 }
 
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index eb4e4a61d06e..3a6882d4aa64 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -145,7 +145,7 @@ static int pdc202xx_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 
 static void pdc202xx_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	pdc202xx_tune_chipset(drive, XFER_PIO_0 + pio);
 }
 
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index a4f88d25b16d..41a3612c6107 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -219,7 +219,7 @@ static void piix_tune_pio (ide_drive_t *drive, u8 pio)
  */
 static void piix_tune_drive (ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	piix_tune_pio(drive, pio);
 	(void) ide_config_drive_speed(drive, XFER_PIO_0 + pio);
 }
diff --git a/drivers/ide/pci/sc1200.c b/drivers/ide/pci/sc1200.c
index 9ac8889f8ad3..98e1a2bd9501 100644
--- a/drivers/ide/pci/sc1200.c
+++ b/drivers/ide/pci/sc1200.c
@@ -304,7 +304,7 @@ static void sc1200_tuneproc (ide_drive_t *drive, byte pio)	/* mode=255 means "au
 		return;
 	}
 
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	printk("SC1200: %s: setting PIO mode%d\n", drive->name, pio);
 
 	if (sc1200_set_xfer_mode(drive, XFER_PIO_0 + pio) == 0)
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index a1954d1c2879..2c76355b92e8 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -210,7 +210,7 @@ static void scc_tuneproc(ide_drive_t *drive, byte mode_wanted)
 	unsigned char speed = XFER_PIO_0;
 	int offset;
 
-	mode_wanted = ide_get_best_pio_mode(drive, mode_wanted, 4, NULL);
+	mode_wanted = ide_get_best_pio_mode(drive, mode_wanted, 4);
 	switch (mode_wanted) {
 	case 4:
 		speed = XFER_PIO_4;
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index abc3cd58aaa9..1fe29d9e68fa 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -205,7 +205,7 @@ static int svwks_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 
 static void svwks_tune_drive (ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	svwks_tune_pio(drive, pio);
 	(void)ide_config_drive_speed(drive, XFER_PIO_0 + pio);
 }
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index 47b9b94afe05..63a8a93d63d6 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -189,7 +189,7 @@ static void sil_tune_pio(ide_drive_t *drive, u8 pio)
 
 	/* trim *taskfile* PIO to the slowest of the master/slave */
 	if (pair->present) {
-		u8 pair_pio = ide_get_best_pio_mode(pair, 255, 4, NULL);
+		u8 pair_pio = ide_get_best_pio_mode(pair, 255, 4);
 
 		if (pair_pio < tf_pio)
 			tf_pio = pair_pio;
@@ -221,7 +221,7 @@ static void sil_tune_pio(ide_drive_t *drive, u8 pio)
 
 static void sil_tuneproc(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	sil_tune_pio(drive, pio);
 	(void)ide_config_drive_speed(drive, XFER_PIO_0 + pio);
 }
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index bbc502087578..3c4693695ca3 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -521,7 +521,7 @@ static void config_art_rwp_pio (ide_drive_t *drive, u8 pio)
 
 static int sis5513_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	config_art_rwp_pio(drive, pio);
 	return ide_config_drive_speed(drive, XFER_PIO_0 + pio);
 }
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index cc5f69b7514f..4e8f32e643a6 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -83,7 +83,7 @@ static u8 sl82c105_tune_pio(ide_drive_t *drive, u8 pio)
 
 	DBG(("sl82c105_tune_pio(drive:%s, pio:%u)\n", drive->name, pio));
 
-	pio = ide_get_best_pio_mode(drive, pio, 5, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 5);
 
 	drv_ctrl = get_pio_timings(drive, pio);
 
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index 115bcaefc8ee..562747fbee39 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -103,7 +103,7 @@ static void slc90e66_tune_pio (ide_drive_t *drive, u8 pio)
 
 static void slc90e66_tune_drive (ide_drive_t *drive, u8 pio)
 {
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	slc90e66_tune_pio(drive, pio);
 	(void) ide_config_drive_speed(drive, XFER_PIO_0 + pio);
 }
diff --git a/drivers/ide/pci/tc86c001.c b/drivers/ide/pci/tc86c001.c
index 1d40b0820f50..2479a19f0094 100644
--- a/drivers/ide/pci/tc86c001.c
+++ b/drivers/ide/pci/tc86c001.c
@@ -47,7 +47,7 @@ static int tc86c001_tune_chipset(ide_drive_t *drive, u8 speed)
 
 static void tc86c001_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	pio =  ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	(void) tc86c001_tune_chipset(drive, XFER_PIO_0 + pio);
 }
 
diff --git a/drivers/ide/pci/triflex.c b/drivers/ide/pci/triflex.c
index 801314fe3524..f8fef905bacc 100644
--- a/drivers/ide/pci/triflex.c
+++ b/drivers/ide/pci/triflex.c
@@ -96,7 +96,7 @@ static int triflex_tune_chipset(ide_drive_t *drive, u8 xferspeed)
 
 static void triflex_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	int use_pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	int use_pio = ide_get_best_pio_mode(drive, pio, 4);
 	(void) triflex_tune_chipset(drive, (XFER_PIO_0 + use_pio));
 }
 
diff --git a/drivers/ide/ppc/mpc8xx.c b/drivers/ide/ppc/mpc8xx.c
index 82de2d781f2e..f3789c0e6cc9 100644
--- a/drivers/ide/ppc/mpc8xx.c
+++ b/drivers/ide/ppc/mpc8xx.c
@@ -431,13 +431,12 @@ void m8xx_ide_init_hwif_ports (hw_regs_t *hw,
 static void
 m8xx_ide_tuneproc(ide_drive_t *drive, u8 pio)
 {
-	ide_pio_data_t d;
 #if defined(CONFIG_IDE_8xx_PCCARD) || defined(CONFIG_IDE_8xx_DIRECT)
 	volatile pcmconf8xx_t	*pcmp;
 	ulong timing, mask, reg;
 #endif
 
-	pio = ide_get_best_pio_mode(drive, pio, 4, &d);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 
 #if 1
 	printk("%s[%d] %s: best PIO mode: %d\n",
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index 669330521fc1..debaa8823f51 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -627,7 +627,7 @@ pmac_ide_tuneproc(ide_drive_t *drive, u8 pio)
 	/* which drive is it ? */
 	timings = &pmif->timings[drive->select.b.unit & 0x01];
 
-	pio = ide_get_best_pio_mode(drive, pio, 4, NULL);
+	pio = ide_get_best_pio_mode(drive, pio, 4);
 	cycle_time = ide_pio_cycle_time(drive, pio);
 
 	switch (pmif->kind) {
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 0afa52c14ffa..14a87f619d17 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1377,12 +1377,8 @@ typedef struct ide_pio_timings_s {
 				/* active + recovery (+ setup for some chips) */
 } ide_pio_timings_t;
 
-typedef struct ide_pio_data_s {
-	u8 pio_mode;
-} ide_pio_data_t;
-
 unsigned int ide_pio_cycle_time(ide_drive_t *, u8);
-extern u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode, ide_pio_data_t *d);
+u8 ide_get_best_pio_mode(ide_drive_t *, u8, u8);
 extern const ide_pio_timings_t ide_pio_timings[6];
 
 
-- 
cgit v1.3-8-gc7d7


From 6a824c92db4d606c324272c4eed366fb71672440 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Jul 2007 01:11:58 +0200
Subject: ide: remove ide_find_best_pio_mode()

* Add ->host_flags to ide_hwif_t to store ide_pci_device_t.host_flags,
  assign it in setup-pci.c:ide_pci_setup_ports().

* Add IDE_HFLAG_PIO_NO_{BLACKLIST,DOWNGRADE} to ide_pci_device_t.host_flags
  and teach ide_get_best_pio_mode() about them.  Also remove needless
  !drive->id check while at it (drive->id is always present).

* Convert amd74xx, via82cxxx and ide-timing.h to use ide_get_best_pio_mode()
  and then remove no longer needed ide_find_best_pio_mode().

There should be no functionality changes caused by this patch.

Acked-by: Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-lib.c       | 13 +++++++------
 drivers/ide/ide-timing.h    | 18 ++----------------
 drivers/ide/ide.c           |  2 ++
 drivers/ide/pci/amd74xx.c   | 20 ++++++++++++--------
 drivers/ide/pci/via82cxxx.c | 22 +++++++++++++---------
 drivers/ide/setup-pci.c     |  2 ++
 include/linux/ide.h         |  7 +++++++
 7 files changed, 45 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index df29868209f3..92a6c7bcf527 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -291,11 +291,11 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode)
 	struct hd_driveid* id = drive->id;
 	int overridden  = 0;
 
-	if (mode_wanted != 255) {
-		pio_mode = mode_wanted;
-	} else if (!drive->id) {
-		pio_mode = 0;
-	} else if ((pio_mode = ide_scan_pio_blacklist(id->model)) != -1) {
+	if (mode_wanted != 255)
+		return min_t(u8, mode_wanted, max_mode);
+
+	if ((drive->hwif->host_flags & IDE_HFLAG_PIO_NO_BLACKLIST) == 0 &&
+	    (pio_mode = ide_scan_pio_blacklist(id->model)) != -1) {
 		printk(KERN_INFO "%s: is on PIO blacklist\n", drive->name);
 	} else {
 		pio_mode = id->tPIO;
@@ -324,7 +324,8 @@ u8 ide_get_best_pio_mode (ide_drive_t *drive, u8 mode_wanted, u8 max_mode)
 		/*
 		 * Conservative "downgrade" for all pre-ATA2 drives
 		 */
-		if (pio_mode && pio_mode < 4) {
+		if ((drive->hwif->host_flags & IDE_HFLAG_PIO_NO_DOWNGRADE) == 0 &&
+		    pio_mode && pio_mode < 4) {
 			pio_mode--;
 			printk(KERN_INFO "%s: applying conservative "
 					 "PIO \"downgrade\"\n", drive->name);
diff --git a/drivers/ide/ide-timing.h b/drivers/ide/ide-timing.h
index 9b5afebafbf4..daffbb9797e1 100644
--- a/drivers/ide/ide-timing.h
+++ b/drivers/ide/ide-timing.h
@@ -106,21 +106,6 @@ static struct ide_timing ide_timing[] = {
 #define XFER_EPIO	0x01
 #define XFER_PIO	0x00
 
-static short ide_find_best_pio_mode(ide_drive_t *drive)
-{
-	struct hd_driveid *id = drive->id;
-	short best = 0;
-
-	/* EIDE PIO modes */
-	if ((id->field_valid & 2) && (id->capability & 8)) {
-		if ((best = (drive->id->eide_pio_modes & 4) ? XFER_PIO_5 :
-			    (drive->id->eide_pio_modes & 2) ? XFER_PIO_4 :
-			    (drive->id->eide_pio_modes & 1) ? XFER_PIO_3 : 0)) return best;
-	}
-
-	return XFER_PIO_0 + min_t(u8, id->tPIO, 2);
-}
-
 static void ide_timing_quantize(struct ide_timing *t, struct ide_timing *q, int T, int UT)
 {
 	q->setup   = EZ(t->setup   * 1000,  T);
@@ -210,7 +195,8 @@ static int ide_timing_compute(ide_drive_t *drive, short speed, struct ide_timing
  */
 
 	if ((speed & XFER_MODE) != XFER_PIO) {
-		ide_timing_compute(drive, ide_find_best_pio_mode(drive), &p, T, UT);
+		u8 pio = ide_get_best_pio_mode(drive, 255, 5);
+		ide_timing_compute(drive, XFER_PIO_0 + pio, &p, T, UT);
 		ide_timing_merge(&p, t, t, IDE_TIMING_ALL);
 	}
 
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 077fb674a96d..b442b341d52e 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -455,6 +455,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
 	hwif->straight8			= tmp_hwif->straight8;
 	hwif->bus_state			= tmp_hwif->bus_state;
 
+	hwif->host_flags		= tmp_hwif->host_flags;
+
 	hwif->atapi_dma			= tmp_hwif->atapi_dma;
 	hwif->ultra_mask		= tmp_hwif->ultra_mask;
 	hwif->mwdma_mask		= tmp_hwif->mwdma_mask;
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index 9c3ea90aeb8b..5ed1d485fc7e 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -1,5 +1,5 @@
 /*
- * Version 2.20
+ * Version 2.21
  *
  * AMD 755/756/766/8111 and nVidia nForce/2/2s/3/3s/CK804/MCP04
  * IDE driver for Linux.
@@ -272,10 +272,8 @@ static int amd_set_drive(ide_drive_t *drive, u8 speed)
 
 static void amd74xx_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	if (pio == 255) {
-		amd_set_drive(drive, ide_find_best_pio_mode(drive));
-		return;
-	}
+	if (pio == 255)
+		pio = ide_get_best_pio_mode(drive, 255, 5);
 
 	amd_set_drive(drive, XFER_PIO_0 + min_t(byte, pio, 5));
 }
@@ -284,12 +282,14 @@ static int amd74xx_ide_dma_check(ide_drive_t *drive)
 {
 	u8 speed = ide_max_dma_mode(drive);
 
-	if (speed == 0)
-		speed = ide_find_best_pio_mode(drive);
+	if (speed == 0) {
+		amd74xx_tune_drive(drive, 255);
+		return -1;
+	}
 
 	amd_set_drive(drive, speed);
 
-	if (drive->autodma && (speed & XFER_MODE) != XFER_PIO)
+	if (drive->autodma)
 		return 0;
 
 	return -1;
@@ -451,6 +451,8 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
 		.autodma	= AUTODMA,				\
 		.enablebits	= {{0x40,0x02,0x02}, {0x40,0x01,0x01}},	\
 		.bootable	= ON_BOARD,				\
+		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST		\
+				| IDE_HFLAG_PIO_NO_DOWNGRADE,		\
 	}
 
 #define DECLARE_NV_DEV(name_str)					\
@@ -461,6 +463,8 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
 		.autodma	= AUTODMA,				\
 		.enablebits	= {{0x50,0x02,0x02}, {0x50,0x01,0x01}},	\
 		.bootable	= ON_BOARD,				\
+		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST		\
+				| IDE_HFLAG_PIO_NO_DOWNGRADE,		\
 	}
 
 static ide_pci_device_t amd74xx_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index bfc9b67f8c92..b107ee3588f7 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -1,6 +1,6 @@
 /*
  *
- * Version 3.45
+ * Version 3.46
  *
  * VIA IDE driver for Linux. Supported southbridges:
  *
@@ -203,10 +203,8 @@ static int via_set_drive(ide_drive_t *drive, u8 speed)
 
 static void via82cxxx_tune_drive(ide_drive_t *drive, u8 pio)
 {
-	if (pio == 255) {
-		via_set_drive(drive, ide_find_best_pio_mode(drive));
-		return;
-	}
+	if (pio == 255)
+		pio = ide_get_best_pio_mode(drive, 255, 5);
 
 	via_set_drive(drive, XFER_PIO_0 + min_t(u8, pio, 5));
 }
@@ -223,12 +221,14 @@ static int via82cxxx_ide_dma_check (ide_drive_t *drive)
 {
 	u8 speed = ide_max_dma_mode(drive);
 
-	if (speed == 0)
-		speed = ide_find_best_pio_mode(drive);
+	if (speed == 0) {
+		via82cxxx_tune_drive(drive, 255);
+		return -1;
+	}
 
 	via_set_drive(drive, speed);
 
-	if (drive->autodma && (speed & XFER_MODE) != XFER_PIO)
+	if (drive->autodma)
 		return 0;
 
 	return -1;
@@ -500,7 +500,9 @@ static ide_pci_device_t via82cxxx_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_via82cxxx,
 		.autodma	= NOAUTODMA,
 		.enablebits	= {{0x40,0x02,0x02}, {0x40,0x01,0x01}},
-		.bootable	= ON_BOARD
+		.bootable	= ON_BOARD,
+		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST
+				| IDE_HFLAG_PIO_NO_DOWNGRADE,
 	},{	/* 1 */
 		.name		= "VP_IDE",
 		.init_chipset	= init_chipset_via82cxxx,
@@ -508,6 +510,8 @@ static ide_pci_device_t via82cxxx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		.bootable	= ON_BOARD,
+		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST
+				| IDE_HFLAG_PIO_NO_DOWNGRADE,
 	}
 };
 
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index bfe1f4e59597..e9f3267456e2 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -613,6 +613,8 @@ void ide_pci_setup_ports(struct pci_dev *dev, ide_pci_device_t *d, int pciirq, a
 		else
 			ide_hwif_setup_dma(dev, d, hwif);
 bypass_legacy_dma:
+		hwif->host_flags = d->host_flags;
+
 		if (d->init_hwif)
 			/* Call chipset-specific routine
 			 * for each enabled hwif
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 14a87f619d17..9f72f6e0c954 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -681,6 +681,8 @@ typedef struct hwif_s {
 	u8 straight8;	/* Alan's straight 8 check */
 	u8 bus_state;	/* power state of the IDE bus */
 
+	u8 host_flags;
+
 	u8 atapi_dma;	/* host supports atapi_dma */
 	u8 ultra_mask;
 	u8 mwdma_mask;
@@ -1245,7 +1247,12 @@ typedef struct ide_pci_enablebit_s {
 enum {
 	/* Uses ISA control ports not PCI ones. */
 	IDE_HFLAG_ISA_PORTS		= (1 << 0),
+	/* single port device */
 	IDE_HFLAG_SINGLE		= (1 << 1),
+	/* don't use legacy PIO blacklist */
+	IDE_HFLAG_PIO_NO_BLACKLIST	= (1 << 2),
+	/* don't use conservative PIO "downgrade" */
+	IDE_HFLAG_PIO_NO_DOWNGRADE	= (1 << 3),
 };
 
 typedef struct ide_pci_device_s {
-- 
cgit v1.3-8-gc7d7


From 4099d14322149c7a467e4997b87be4ba8eb78697 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 20 Jul 2007 01:11:59 +0200
Subject: ide: add PIO masks

* Add ATA_PIO[0-6] defines to <linux/ata.h>.

* Add ->pio_mask field to ide_pci_device_t and ide_hwif_t.

* Add PIO masks to host drivers.

<linux/ata.h> change ACK-ed by Jeff Garzik <jeff@garzik.org>.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/cris/ide-cris.c    |  1 +
 drivers/ide/ide.c              |  2 ++
 drivers/ide/legacy/ali14xx.c   |  2 ++
 drivers/ide/legacy/dtc2278.c   |  1 +
 drivers/ide/legacy/ht6560b.c   |  2 ++
 drivers/ide/legacy/qd65xx.c    |  1 +
 drivers/ide/legacy/umc8672.c   |  2 ++
 drivers/ide/mips/au1xxx-ide.c  |  2 ++
 drivers/ide/pci/aec62xx.c      |  5 +++++
 drivers/ide/pci/alim15x3.c     |  1 +
 drivers/ide/pci/amd74xx.c      |  2 ++
 drivers/ide/pci/atiixp.c       |  2 ++
 drivers/ide/pci/cmd640.c       |  2 ++
 drivers/ide/pci/cmd64x.c       |  4 ++++
 drivers/ide/pci/cs5520.c       |  1 +
 drivers/ide/pci/cs5530.c       |  1 +
 drivers/ide/pci/cs5535.c       |  1 +
 drivers/ide/pci/cy82c693.c     |  1 +
 drivers/ide/pci/hpt34x.c       |  3 ++-
 drivers/ide/pci/hpt366.c       | 18 ++++++++++++------
 drivers/ide/pci/it8213.c       |  1 +
 drivers/ide/pci/it821x.c       |  3 ++-
 drivers/ide/pci/jmicron.c      |  1 +
 drivers/ide/pci/opti621.c      |  2 ++
 drivers/ide/pci/pdc202xx_new.c |  7 +++++++
 drivers/ide/pci/pdc202xx_old.c |  5 +++++
 drivers/ide/pci/piix.c         |  2 ++
 drivers/ide/pci/sc1200.c       |  1 +
 drivers/ide/pci/scc_pata.c     |  1 +
 drivers/ide/pci/serverworks.c  |  5 +++++
 drivers/ide/pci/sgiioc4.c      |  1 +
 drivers/ide/pci/siimage.c      |  1 +
 drivers/ide/pci/sis5513.c      |  1 +
 drivers/ide/pci/sl82c105.c     |  1 +
 drivers/ide/pci/slc90e66.c     |  1 +
 drivers/ide/pci/tc86c001.c     |  1 +
 drivers/ide/pci/triflex.c      |  1 +
 drivers/ide/pci/via82cxxx.c    |  2 ++
 drivers/ide/ppc/mpc8xx.c       |  2 ++
 drivers/ide/ppc/pmac.c         |  1 +
 drivers/ide/setup-pci.c        |  1 +
 include/linux/ata.h            |  9 +++++++++
 include/linux/ide.h            |  3 +++
 43 files changed, 99 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/cris/ide-cris.c b/drivers/ide/cris/ide-cris.c
index e8cd86e86433..9361154ba5c2 100644
--- a/drivers/ide/cris/ide-cris.c
+++ b/drivers/ide/cris/ide-cris.c
@@ -814,6 +814,7 @@ init_e100_ide (void)
 		hwif->dma_host_on = &cris_dma_on;
 		hwif->dma_off_quietly = &cris_dma_off;
 		hwif->cbl = ATA_CBL_PATA40;
+		hwif->pio_mask = ATA_PIO4,
 		hwif->ultra_mask = cris_ultra_mask;
 		hwif->mwdma_mask = 0x07; /* Multiword DMA 0-2 */
 		hwif->autodma = 1;
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index b442b341d52e..f3ea5ea41fd4 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -457,6 +457,8 @@ static void ide_hwif_restore(ide_hwif_t *hwif, ide_hwif_t *tmp_hwif)
 
 	hwif->host_flags		= tmp_hwif->host_flags;
 
+	hwif->pio_mask			= tmp_hwif->pio_mask;
+
 	hwif->atapi_dma			= tmp_hwif->atapi_dma;
 	hwif->ultra_mask		= tmp_hwif->ultra_mask;
 	hwif->mwdma_mask		= tmp_hwif->mwdma_mask;
diff --git a/drivers/ide/legacy/ali14xx.c b/drivers/ide/legacy/ali14xx.c
index d5c7a57b71c1..9b9c4761cb7d 100644
--- a/drivers/ide/legacy/ali14xx.c
+++ b/drivers/ide/legacy/ali14xx.c
@@ -211,10 +211,12 @@ static int __init ali14xx_probe(void)
 	mate = &ide_hwifs[1];
 
 	hwif->chipset = ide_ali14xx;
+	hwif->pio_mask = ATA_PIO4;
 	hwif->tuneproc = &ali14xx_tune_drive;
 	hwif->mate = mate;
 
 	mate->chipset = ide_ali14xx;
+	mate->pio_mask = ATA_PIO4;
 	mate->tuneproc = &ali14xx_tune_drive;
 	mate->mate = hwif;
 	mate->channel = 1;
diff --git a/drivers/ide/legacy/dtc2278.c b/drivers/ide/legacy/dtc2278.c
index 8c4c27e5dc10..6c01d951d074 100644
--- a/drivers/ide/legacy/dtc2278.c
+++ b/drivers/ide/legacy/dtc2278.c
@@ -123,6 +123,7 @@ static int __init dtc2278_probe(void)
 
 	hwif->serialized = 1;
 	hwif->chipset = ide_dtc2278;
+	hwif->pio_mask = ATA_PIO4;
 	hwif->tuneproc = &tune_dtc2278;
 	hwif->drives[0].no_unmask = 1;
 	hwif->drives[1].no_unmask = 1;
diff --git a/drivers/ide/legacy/ht6560b.c b/drivers/ide/legacy/ht6560b.c
index 82ed37df9566..bfaa2025173b 100644
--- a/drivers/ide/legacy/ht6560b.c
+++ b/drivers/ide/legacy/ht6560b.c
@@ -333,12 +333,14 @@ int __init ht6560b_init(void)
 
 	hwif->chipset = ide_ht6560b;
 	hwif->selectproc = &ht6560b_selectproc;
+	hwif->pio_mask = ATA_PIO5;
 	hwif->tuneproc = &tune_ht6560b;
 	hwif->serialized = 1;	/* is this needed? */
 	hwif->mate = mate;
 
 	mate->chipset = ide_ht6560b;
 	mate->selectproc = &ht6560b_selectproc;
+	mate->pio_mask = ATA_PIO5;
 	mate->tuneproc = &tune_ht6560b;
 	mate->serialized = 1;	/* is this needed? */
 	mate->mate = hwif;
diff --git a/drivers/ide/legacy/qd65xx.c b/drivers/ide/legacy/qd65xx.c
index 39145102b348..8b87a424094a 100644
--- a/drivers/ide/legacy/qd65xx.c
+++ b/drivers/ide/legacy/qd65xx.c
@@ -346,6 +346,7 @@ static void __init qd_setup(ide_hwif_t *hwif, int base, int config,
 	hwif->drives[1].drive_data = data1;
 	hwif->drives[0].io_32bit =
 	hwif->drives[1].io_32bit = 1;
+	hwif->pio_mask = ATA_PIO4;
 	hwif->tuneproc = tuneproc;
 	probe_hwif_init(hwif);
 }
diff --git a/drivers/ide/legacy/umc8672.c b/drivers/ide/legacy/umc8672.c
index caeebd41081f..d2862e638bc5 100644
--- a/drivers/ide/legacy/umc8672.c
+++ b/drivers/ide/legacy/umc8672.c
@@ -149,10 +149,12 @@ static int __init umc8672_probe(void)
 	mate = &ide_hwifs[1];
 
 	hwif->chipset = ide_umc8672;
+	hwif->pio_mask = ATA_PIO4;
 	hwif->tuneproc = &tune_umc;
 	hwif->mate = mate;
 
 	mate->chipset = ide_umc8672;
+	mate->pio_mask = ATA_PIO4;
 	mate->tuneproc = &tune_umc;
 	mate->mate = hwif;
 	mate->channel = 1;
diff --git a/drivers/ide/mips/au1xxx-ide.c b/drivers/ide/mips/au1xxx-ide.c
index b0d13c34d310..2ba6a054b861 100644
--- a/drivers/ide/mips/au1xxx-ide.c
+++ b/drivers/ide/mips/au1xxx-ide.c
@@ -692,6 +692,8 @@ static int au_ide_probe(struct device *dev)
 	hwif->swdma_mask                = 0x0;
 #endif
 
+	hwif->pio_mask = ATA_PIO4;
+
 	hwif->noprobe = 0;
 	hwif->drives[0].unmask          = 1;
 	hwif->drives[1].unmask          = 1;
diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index 8396d711f232..74432830abf7 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -268,6 +268,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x07, /* udma0-2 */
 	},{	/* 1 */
 		.name		= "AEC6260",
@@ -276,6 +277,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_aec62xx,
 		.autodma	= NOAUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 2 */
 		.name		= "AEC6260R",
@@ -285,6 +287,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= NEVER_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 3 */
 		.name		= "AEC6280",
@@ -293,6 +296,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_aec62xx,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 4 */
 		.name		= "AEC6280R",
@@ -302,6 +306,7 @@ static ide_pci_device_t aec62xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x3f, /* udma0-5 */
 	}
 };
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index a0c43a91b0f3..5511c86733dc 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -818,6 +818,7 @@ static ide_pci_device_t ali15x3_chipset __devinitdata = {
 	.init_dma	= init_dma_ali15x3,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
+	.pio_mask	= ATA_PIO5,
 };
 
 /**
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index 5ed1d485fc7e..06c15a6a3e7d 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -453,6 +453,7 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
 		.bootable	= ON_BOARD,				\
 		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST		\
 				| IDE_HFLAG_PIO_NO_DOWNGRADE,		\
+		.pio_mask	= ATA_PIO5,				\
 	}
 
 #define DECLARE_NV_DEV(name_str)					\
@@ -465,6 +466,7 @@ static void __devinit init_hwif_amd74xx(ide_hwif_t *hwif)
 		.bootable	= ON_BOARD,				\
 		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST		\
 				| IDE_HFLAG_PIO_NO_DOWNGRADE,		\
+		.pio_mask	= ATA_PIO5,				\
 	}
 
 static ide_pci_device_t amd74xx_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index a4740e4776fd..1725aa402d98 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -294,6 +294,7 @@ static ide_pci_device_t atiixp_pci_info[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x48,0x01,0x00}, {0x48,0x08,0x00}},
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 1 */
 		.name		= "SB600_PATA",
 		.init_hwif	= init_hwif_atiixp,
@@ -301,6 +302,7 @@ static ide_pci_device_t atiixp_pci_info[] __devinitdata = {
 		.enablebits	= {{0x48,0x01,0x00}, {0x00,0x00,0x00}},
  		.bootable	= ON_BOARD,
  		.host_flags	= IDE_HFLAG_SINGLE,
+		.pio_mask	= ATA_PIO4,
  	},
 };
 
diff --git a/drivers/ide/pci/cmd640.c b/drivers/ide/pci/cmd640.c
index 335482981a92..9689494efa24 100644
--- a/drivers/ide/pci/cmd640.c
+++ b/drivers/ide/pci/cmd640.c
@@ -766,6 +766,7 @@ int __init ide_probe_for_cmd640x (void)
 	       cmd_hwif0->name, 'a' + cmd640_chip_version - 1, bus_type, cfr);
 	cmd_hwif0->chipset = ide_cmd640;
 #ifdef CONFIG_BLK_DEV_CMD640_ENHANCED
+	cmd_hwif0->pio_mask = ATA_PIO5;
 	cmd_hwif0->tuneproc = &cmd640_tune_drive;
 #endif /* CONFIG_BLK_DEV_CMD640_ENHANCED */
 
@@ -821,6 +822,7 @@ int __init ide_probe_for_cmd640x (void)
 		cmd_hwif1->mate = cmd_hwif0;
 		cmd_hwif1->channel = 1;
 #ifdef CONFIG_BLK_DEV_CMD640_ENHANCED
+		cmd_hwif1->pio_mask = ATA_PIO5;
 		cmd_hwif1->tuneproc = &cmd640_tune_drive;
 #endif /* CONFIG_BLK_DEV_CMD640_ENHANCED */
 	}
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index 9cf969b61bf2..19633c5aba15 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -622,6 +622,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x00,0x00,0x00}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO5,
 		.udma_mask	= 0x00, /* no udma */
 	},{	/* 1 */
 		.name		= "CMD646",
@@ -631,6 +632,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO5,
 		.udma_mask	= 0x07, /* udma0-2 */
 	},{	/* 2 */
 		.name		= "CMD648",
@@ -640,6 +642,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO5,
 		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 3 */
 		.name		= "CMD649",
@@ -649,6 +652,7 @@ static ide_pci_device_t cmd64x_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO5,
 		.udma_mask	= 0x3f, /* udma0-5 */
 	}
 };
diff --git a/drivers/ide/pci/cs5520.c b/drivers/ide/pci/cs5520.c
index ee10e48d7cef..bccedf9b8b28 100644
--- a/drivers/ide/pci/cs5520.c
+++ b/drivers/ide/pci/cs5520.c
@@ -197,6 +197,7 @@ static void __devinit init_hwif_cs5520(ide_hwif_t *hwif)
 		.autodma	= AUTODMA,			\
 		.bootable	= ON_BOARD,			\
 		.host_flags	= IDE_HFLAG_ISA_PORTS,		\
+		.pio_mask	= ATA_PIO4,			\
 	}
 
 static ide_pci_device_t cyrix_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/cs5530.c b/drivers/ide/pci/cs5530.c
index a75c14a93182..acaf71fd4c09 100644
--- a/drivers/ide/pci/cs5530.c
+++ b/drivers/ide/pci/cs5530.c
@@ -343,6 +343,7 @@ static ide_pci_device_t cs5530_chipset __devinitdata = {
 	.init_hwif	= init_hwif_cs5530,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit cs5530_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/cs5535.c b/drivers/ide/pci/cs5535.c
index 0bff4005292a..ce44e38390aa 100644
--- a/drivers/ide/pci/cs5535.c
+++ b/drivers/ide/pci/cs5535.c
@@ -231,6 +231,7 @@ static ide_pci_device_t cs5535_chipset __devinitdata = {
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
 	.host_flags	= IDE_HFLAG_SINGLE,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit cs5535_init_one(struct pci_dev *dev,
diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c
index cb7c18187e3c..daa36fcbc8ef 100644
--- a/drivers/ide/pci/cy82c693.c
+++ b/drivers/ide/pci/cy82c693.c
@@ -486,6 +486,7 @@ static ide_pci_device_t cy82c693_chipset __devinitdata = {
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
 	.host_flags	= IDE_HFLAG_SINGLE,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit cy82c693_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
index 2e4591303f4f..19778c5fe711 100644
--- a/drivers/ide/pci/hpt34x.c
+++ b/drivers/ide/pci/hpt34x.c
@@ -177,7 +177,8 @@ static ide_pci_device_t hpt34x_chipset __devinitdata = {
 	.init_hwif	= init_hwif_hpt34x,
 	.autodma	= NOAUTODMA,
 	.bootable	= NEVER_BOARD,
-	.extra		= 16
+	.extra		= 16,
+	.pio_mask	= ATA_PIO5,
 };
 
 static int __devinit hpt34x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index 0a3fe9471eb6..2cd74c345a6c 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -1549,7 +1549,8 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.bootable	= OFF_BOARD,
-		.extra		= 240
+		.extra		= 240,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 1 */
 		.name		= "HPT372A",
 		.init_setup	= init_setup_hpt372a,
@@ -1560,7 +1561,8 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
 		.bootable	= OFF_BOARD,
-		.extra		= 240
+		.extra		= 240,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 2 */
 		.name		= "HPT302",
 		.init_setup	= init_setup_hpt302,
@@ -1571,7 +1573,8 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT302_ALLOW_ATA133_6 ? 0x7f : 0x3f,
 		.bootable	= OFF_BOARD,
-		.extra		= 240
+		.extra		= 240,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 3 */
 		.name		= "HPT371",
 		.init_setup	= init_setup_hpt371,
@@ -1582,7 +1585,8 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT371_ALLOW_ATA133_6 ? 0x7f : 0x3f,
 		.bootable	= OFF_BOARD,
-		.extra		= 240
+		.extra		= 240,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 4 */
 		.name		= "HPT374",
 		.init_setup	= init_setup_hpt374,
@@ -1593,7 +1597,8 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= 0x3f,
 		.bootable	= OFF_BOARD,
-		.extra		= 240
+		.extra		= 240,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 5 */
 		.name		= "HPT372N",
 		.init_setup	= init_setup_hpt372n,
@@ -1604,7 +1609,8 @@ static ide_pci_device_t hpt366_chipsets[] __devinitdata = {
 		.enablebits	= {{0x50,0x04,0x04}, {0x54,0x04,0x04}},
 		.udma_mask	= HPT372_ALLOW_ATA133_6 ? 0x7f : 0x3f,
 		.bootable	= OFF_BOARD,
-		.extra		= 240
+		.extra		= 240,
+		.pio_mask	= ATA_PIO4,
 	}
 };
 
diff --git a/drivers/ide/pci/it8213.c b/drivers/ide/pci/it8213.c
index d8425a2499bf..95dbed7e6022 100644
--- a/drivers/ide/pci/it8213.c
+++ b/drivers/ide/pci/it8213.c
@@ -276,6 +276,7 @@ static void __devinit init_hwif_it8213(ide_hwif_t *hwif)
 		.enablebits	= {{0x41,0x80,0x80}}, \
 		.bootable	= ON_BOARD,		\
 		.host_flags	= IDE_HFLAG_SINGLE,	\
+		.pio_mask	= ATA_PIO4,		\
 	}
 
 static ide_pci_device_t it8213_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
index 790233db017b..9286c99e2ff0 100644
--- a/drivers/ide/pci/it821x.c
+++ b/drivers/ide/pci/it821x.c
@@ -720,7 +720,8 @@ static unsigned int __devinit init_chipset_it821x(struct pci_dev *dev, const cha
 		.init_hwif	= init_hwif_it821x,	\
 		.autodma	= AUTODMA,		\
 		.bootable	= ON_BOARD,		\
-		.fixup	 	= it821x_fixups		\
+		.fixup	 	= it821x_fixups,	\
+		.pio_mask	= ATA_PIO4,		\
 	}
 
 static ide_pci_device_t it821x_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/jmicron.c b/drivers/ide/pci/jmicron.c
index 57d3d4186b3f..d7ce9dd8de16 100644
--- a/drivers/ide/pci/jmicron.c
+++ b/drivers/ide/pci/jmicron.c
@@ -180,6 +180,7 @@ fallback:
 		.autodma	= AUTODMA,		\
 		.bootable	= ON_BOARD,		\
 		.enablebits	= { {0x40, 1, 1}, {0x40, 0x10, 0x10} }, \
+		.pio_mask	= ATA_PIO5,		\
 	}
 
 static ide_pci_device_t jmicron_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/opti621.c b/drivers/ide/pci/opti621.c
index 1802ad0927eb..3a2bb2723515 100644
--- a/drivers/ide/pci/opti621.c
+++ b/drivers/ide/pci/opti621.c
@@ -353,12 +353,14 @@ static ide_pci_device_t opti621_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x45,0x80,0x00}, {0x40,0x08,0x00}},
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO3,
 	},{	/* 1 */
 		.name		= "OPTI621X",
 		.init_hwif	= init_hwif_opti621,
 		.autodma	= AUTODMA,
 		.enablebits	= {{0x45,0x80,0x00}, {0x40,0x08,0x00}},
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO3,
 	}
 };
 
diff --git a/drivers/ide/pci/pdc202xx_new.c b/drivers/ide/pci/pdc202xx_new.c
index a8a01b4ab607..8a66a2871b3a 100644
--- a/drivers/ide/pci/pdc202xx_new.c
+++ b/drivers/ide/pci/pdc202xx_new.c
@@ -568,6 +568,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_pdc202new,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 1 */
 		.name		= "PDC20269",
@@ -576,6 +577,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_pdc202new,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 2 */
 		.name		= "PDC20270",
@@ -584,6 +586,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_pdc202new,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 3 */
 		.name		= "PDC20271",
@@ -592,6 +595,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_pdc202new,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 4 */
 		.name		= "PDC20275",
@@ -600,6 +604,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_pdc202new,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 5 */
 		.name		= "PDC20276",
@@ -608,6 +613,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_pdc202new,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x7f, /* udma0-6*/
 	},{	/* 6 */
 		.name		= "PDC20277",
@@ -616,6 +622,7 @@ static ide_pci_device_t pdcnew_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_pdc202new,
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x7f, /* udma0-6*/
 	}
 };
diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index 3a6882d4aa64..fbcb0bb9c956 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -444,6 +444,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 16,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x07, /* udma0-2 */
 	},{	/* 1 */
 		.name		= "PDC20262",
@@ -454,6 +455,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 2 */
 		.name		= "PDC20263",
@@ -464,6 +466,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x1f, /* udma0-4 */
 	},{	/* 3 */
 		.name		= "PDC20265",
@@ -474,6 +477,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x3f, /* udma0-5 */
 	},{	/* 4 */
 		.name		= "PDC20267",
@@ -484,6 +488,7 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
+		.pio_mask	= ATA_PIO4,
 		.udma_mask	= 0x3f, /* udma0-5 */
 	}
 };
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 41a3612c6107..4f69cd067e5e 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -498,6 +498,7 @@ static void __devinit init_hwif_piix(ide_hwif_t *hwif)
 		.autodma	= AUTODMA,		\
 		.enablebits	= {{0x41,0x80,0x80}, {0x43,0x80,0x80}}, \
 		.bootable	= ON_BOARD,		\
+		.pio_mask	= ATA_PIO4,		\
 		.udma_mask	= udma,			\
 	}
 
@@ -517,6 +518,7 @@ static ide_pci_device_t piix_pci_info[] __devinitdata = {
 		.enablebits	= {{0x6d,0xc0,0x80}, {0x6d,0xc0,0xc0}},
 		.bootable	= ON_BOARD,
 		.host_flags	= IDE_HFLAG_ISA_PORTS,
+		.pio_mask	= ATA_PIO4,
 	},
 
 	/*  3 */ DECLARE_PIIX_DEV("PIIX3", 0x00),	/* no udma */
diff --git a/drivers/ide/pci/sc1200.c b/drivers/ide/pci/sc1200.c
index 98e1a2bd9501..9bdc9694d50d 100644
--- a/drivers/ide/pci/sc1200.c
+++ b/drivers/ide/pci/sc1200.c
@@ -438,6 +438,7 @@ static ide_pci_device_t sc1200_chipset __devinitdata = {
 	.init_hwif	= init_hwif_sc1200,
 	.autodma	= AUTODMA,
 	.bootable	= ON_BOARD,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit sc1200_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c
index 2c76355b92e8..f668d235e6be 100644
--- a/drivers/ide/pci/scc_pata.c
+++ b/drivers/ide/pci/scc_pata.c
@@ -775,6 +775,7 @@ static void __devinit init_hwif_scc(ide_hwif_t *hwif)
       .autodma	= AUTODMA,				\
       .bootable	= ON_BOARD,				\
       .host_flags	= IDE_HFLAG_SINGLE,		\
+      .pio_mask		= ATA_PIO4,			\
   }
 
 static ide_pci_device_t scc_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index 1fe29d9e68fa..9fead2e7d4c8 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -451,6 +451,7 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_svwks,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 1 */
 		.name		= "SvrWks CSB5",
 		.init_setup	= init_setup_svwks,
@@ -458,6 +459,7 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_svwks,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 2 */
 		.name		= "SvrWks CSB6",
 		.init_setup	= init_setup_csb6,
@@ -465,6 +467,7 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.init_hwif	= init_hwif_svwks,
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 3 */
 		.name		= "SvrWks CSB6",
 		.init_setup	= init_setup_csb6,
@@ -473,6 +476,7 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 		.host_flags	= IDE_HFLAG_SINGLE,
+		.pio_mask	= ATA_PIO4,
 	},{	/* 4 */
 		.name		= "SvrWks HT1000",
 		.init_setup	= init_setup_svwks,
@@ -481,6 +485,7 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
 		.host_flags	= IDE_HFLAG_SINGLE,
+		.pio_mask	= ATA_PIO4,
 	}
 };
 
diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c
index ed983b56b4e2..57145767c3df 100644
--- a/drivers/ide/pci/sgiioc4.c
+++ b/drivers/ide/pci/sgiioc4.c
@@ -586,6 +586,7 @@ ide_init_sgiioc4(ide_hwif_t * hwif)
 	hwif->ultra_mask = 0x0;	/* Disable Ultra DMA */
 	hwif->mwdma_mask = 0x2;	/* Multimode-2 DMA  */
 	hwif->swdma_mask = 0x2;
+	hwif->pio_mask = 0x00;
 	hwif->tuneproc = NULL;	/* Sets timing for PIO mode */
 	hwif->speedproc = NULL;	/* Sets timing for DMA &/or PIO modes */
 	hwif->selectproc = NULL;/* Use the default routine to select drive */
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index 63a8a93d63d6..50f6d172ef77 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -959,6 +959,7 @@ static void __devinit init_hwif_siimage(ide_hwif_t *hwif)
 		.fixup		= siimage_fixup,	\
 		.autodma	= AUTODMA,		\
 		.bootable	= ON_BOARD,		\
+		.pio_mask	= ATA_PIO4,		\
 	}
 
 static ide_pci_device_t siimage_chipsets[] __devinitdata = {
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 3c4693695ca3..63fbb79e8178 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -881,6 +881,7 @@ static ide_pci_device_t sis5513_chipset __devinitdata = {
 	.autodma	= NOAUTODMA,
 	.enablebits	= {{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 	.bootable	= ON_BOARD,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit sis5513_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index 4e8f32e643a6..0947cab00595 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -456,6 +456,7 @@ static ide_pci_device_t sl82c105_chipset __devinitdata = {
 	.autodma	= NOAUTODMA,
 	.enablebits	= {{0x40,0x01,0x01}, {0x40,0x10,0x10}},
 	.bootable	= ON_BOARD,
+	.pio_mask	= ATA_PIO5,
 };
 
 static int __devinit sl82c105_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index 562747fbee39..8e655f2db5cb 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -217,6 +217,7 @@ static ide_pci_device_t slc90e66_chipset __devinitdata = {
 	.autodma	= AUTODMA,
 	.enablebits	= {{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 	.bootable	= ON_BOARD,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit slc90e66_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ide/pci/tc86c001.c b/drivers/ide/pci/tc86c001.c
index 2479a19f0094..ec79bacc30c2 100644
--- a/drivers/ide/pci/tc86c001.c
+++ b/drivers/ide/pci/tc86c001.c
@@ -251,6 +251,7 @@ static ide_pci_device_t tc86c001_chipset __devinitdata = {
 	.autodma	= AUTODMA,
 	.bootable	= OFF_BOARD,
 	.host_flags	= IDE_HFLAG_SINGLE,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit tc86c001_init_one(struct pci_dev *dev,
diff --git a/drivers/ide/pci/triflex.c b/drivers/ide/pci/triflex.c
index f8fef905bacc..024bbfae0429 100644
--- a/drivers/ide/pci/triflex.c
+++ b/drivers/ide/pci/triflex.c
@@ -132,6 +132,7 @@ static ide_pci_device_t triflex_device __devinitdata = {
 	.autodma	= AUTODMA,
 	.enablebits	= {{0x80, 0x01, 0x01}, {0x80, 0x02, 0x02}},
 	.bootable	= ON_BOARD,
+	.pio_mask	= ATA_PIO4,
 };
 
 static int __devinit triflex_init_one(struct pci_dev *dev, 
diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index b107ee3588f7..581316f9581d 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -503,6 +503,7 @@ static ide_pci_device_t via82cxxx_chipsets[] __devinitdata = {
 		.bootable	= ON_BOARD,
 		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST
 				| IDE_HFLAG_PIO_NO_DOWNGRADE,
+		.pio_mask	= ATA_PIO5,
 	},{	/* 1 */
 		.name		= "VP_IDE",
 		.init_chipset	= init_chipset_via82cxxx,
@@ -512,6 +513,7 @@ static ide_pci_device_t via82cxxx_chipsets[] __devinitdata = {
 		.bootable	= ON_BOARD,
 		.host_flags	= IDE_HFLAG_PIO_NO_BLACKLIST
 				| IDE_HFLAG_PIO_NO_DOWNGRADE,
+		.pio_mask	= ATA_PIO5,
 	}
 };
 
diff --git a/drivers/ide/ppc/mpc8xx.c b/drivers/ide/ppc/mpc8xx.c
index f3789c0e6cc9..8859fe2f5ac2 100644
--- a/drivers/ide/ppc/mpc8xx.c
+++ b/drivers/ide/ppc/mpc8xx.c
@@ -316,6 +316,7 @@ m8xx_ide_init_hwif_ports(hw_regs_t *hw, unsigned long data_port,
 	}
 
 	/* register routine to tune PIO mode */
+	ide_hwifs[data_port].pio_mask = ATA_PIO4;
 	ide_hwifs[data_port].tuneproc = m8xx_ide_tuneproc;
 
 	hw->ack_intr = (ide_ack_intr_t *) ide_interrupt_ack;
@@ -402,6 +403,7 @@ void m8xx_ide_init_hwif_ports (hw_regs_t *hw,
 	}
 
 	/* register routine to tune PIO mode */
+	ide_hwifs[data_port].pio_mask = ATA_PIO4;
 	ide_hwifs[data_port].tuneproc = m8xx_ide_tuneproc;
 
 	hw->ack_intr = (ide_ack_intr_t *) ide_interrupt_ack;
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index debaa8823f51..33630ad3e794 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -1248,6 +1248,7 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
 	hwif->cbl = pmif->cable_80 ? ATA_CBL_PATA80 : ATA_CBL_PATA40;
 	hwif->drives[0].unmask = 1;
 	hwif->drives[1].unmask = 1;
+	hwif->pio_mask = ATA_PIO4;
 	hwif->tuneproc = pmac_ide_tuneproc;
 	if (pmif->kind == controller_un_ata6
 	    || pmif->kind == controller_k2_ata6
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index e9f3267456e2..f9e455cc8092 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -614,6 +614,7 @@ void ide_pci_setup_ports(struct pci_dev *dev, ide_pci_device_t *d, int pciirq, a
 			ide_hwif_setup_dma(dev, d, hwif);
 bypass_legacy_dma:
 		hwif->host_flags = d->host_flags;
+		hwif->pio_mask = d->pio_mask;
 
 		if (d->init_hwif)
 			/* Call chipset-specific routine
diff --git a/include/linux/ata.h b/include/linux/ata.h
index b5a20162af32..23a22df039d8 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -64,6 +64,15 @@ enum {
 	ATA_ID_PROD_LEN		= 40,
 
 	ATA_PCI_CTL_OFS		= 2,
+
+	ATA_PIO0		= (1 << 0),
+	ATA_PIO1		= ATA_PIO0 | (1 << 1),
+	ATA_PIO2		= ATA_PIO1 | (1 << 2),
+	ATA_PIO3		= ATA_PIO2 | (1 << 3),
+	ATA_PIO4		= ATA_PIO3 | (1 << 4),
+	ATA_PIO5		= ATA_PIO4 | (1 << 5),
+	ATA_PIO6		= ATA_PIO5 | (1 << 6),
+
 	ATA_UDMA0		= (1 << 0),
 	ATA_UDMA1		= ATA_UDMA0 | (1 << 1),
 	ATA_UDMA2		= ATA_UDMA1 | (1 << 2),
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9f72f6e0c954..5f5daad8bc54 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -683,6 +683,8 @@ typedef struct hwif_s {
 
 	u8 host_flags;
 
+	u8 pio_mask;
+
 	u8 atapi_dma;	/* host supports atapi_dma */
 	u8 ultra_mask;
 	u8 mwdma_mask;
@@ -1270,6 +1272,7 @@ typedef struct ide_pci_device_s {
 	unsigned int		extra;
 	struct ide_pci_device_s	*next;
 	u8			host_flags;
+	u8			pio_mask;
 	u8			udma_mask;
 } ide_pci_device_t;
 
-- 
cgit v1.3-8-gc7d7


From 20c2df83d25c6a95affe6157a4c9cac4cf5ffaac Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 20 Jul 2007 10:11:58 +0900
Subject: mm: Remove slab destructors from kmem_cache_create().

Slab destructors were no longer supported after Christoph's
c59def9f222d44bb7e2f0a559f2906191a0862d7 change. They've been
BUGs for both slab and slub, and slob never supported them
either.

This rips out support for the dtor pointer from kmem_cache_create()
completely and fixes up every single callsite in the kernel (there were
about 224, not including the slab allocator definitions themselves,
or the documentation references).

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/arm/plat-s3c24xx/dma.c               |  2 +-
 arch/arm26/mm/memc.c                      |  4 ++--
 arch/i386/mm/init.c                       |  3 +--
 arch/ia64/ia32/ia32_support.c             |  2 +-
 arch/powerpc/kernel/rtas_flash.c          |  2 +-
 arch/powerpc/mm/hugetlbpage.c             |  2 +-
 arch/powerpc/mm/init_64.c                 |  3 +--
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 arch/sh/kernel/cpu/sh4/sq.c               |  3 +--
 arch/sh/mm/pmb.c                          |  2 +-
 arch/sparc64/mm/tsb.c                     |  3 +--
 block/bsg.c                               |  2 +-
 block/ll_rw_blk.c                         |  6 +++---
 drivers/acpi/osl.c                        |  2 +-
 drivers/block/aoe/aoeblk.c                |  4 ++--
 drivers/ieee1394/eth1394.c                |  2 +-
 drivers/infiniband/core/mad.c             |  1 -
 drivers/infiniband/hw/amso1100/c2_vq.c    |  2 +-
 drivers/infiniband/hw/ehca/ehca_av.c      |  2 +-
 drivers/infiniband/hw/ehca/ehca_cq.c      |  2 +-
 drivers/infiniband/hw/ehca/ehca_main.c    |  2 +-
 drivers/infiniband/hw/ehca/ehca_mrmw.c    |  4 ++--
 drivers/infiniband/hw/ehca/ehca_pd.c      |  2 +-
 drivers/infiniband/hw/ehca/ehca_qp.c      |  2 +-
 drivers/infiniband/ulp/iser/iscsi_iser.c  |  2 +-
 drivers/kvm/mmu.c                         |  8 ++++----
 drivers/md/raid5.c                        |  4 ++--
 drivers/message/i2o/i2o_block.c           |  3 +--
 drivers/mtd/ubi/eba.c                     |  2 +-
 drivers/mtd/ubi/wl.c                      |  2 +-
 drivers/s390/block/dasd_devmap.c          |  2 +-
 drivers/s390/scsi/zfcp_aux.c              |  6 +++---
 drivers/scsi/aic94xx/aic94xx_init.c       |  4 ++--
 drivers/scsi/libsas/sas_init.c            |  2 +-
 drivers/scsi/qla2xxx/qla_os.c             |  2 +-
 drivers/scsi/qla4xxx/ql4_os.c             |  2 +-
 drivers/scsi/scsi.c                       |  2 +-
 drivers/scsi/scsi_lib.c                   |  4 ++--
 drivers/scsi/scsi_tgt_lib.c               |  2 +-
 drivers/usb/host/uhci-hcd.c               |  2 +-
 drivers/usb/mon/mon_text.c                |  2 +-
 fs/adfs/super.c                           |  4 ++--
 fs/affs/super.c                           |  2 +-
 fs/afs/super.c                            |  3 +--
 fs/befs/linuxvfs.c                        |  4 ++--
 fs/bfs/inode.c                            |  4 ++--
 fs/bio.c                                  |  2 +-
 fs/block_dev.c                            |  2 +-
 fs/cifs/cifsfs.c                          | 10 +++++-----
 fs/coda/inode.c                           |  4 ++--
 fs/configfs/mount.c                       |  2 +-
 fs/dcache.c                               |  4 ++--
 fs/dcookies.c                             |  2 +-
 fs/dlm/lowcomms.c                         |  2 +-
 fs/dlm/memory.c                           |  2 +-
 fs/dnotify.c                              |  2 +-
 fs/dquot.c                                |  4 ++--
 fs/ecryptfs/main.c                        |  2 +-
 fs/efs/super.c                            |  4 ++--
 fs/eventpoll.c                            |  4 ++--
 fs/ext2/super.c                           |  4 ++--
 fs/ext3/super.c                           |  2 +-
 fs/ext4/super.c                           |  2 +-
 fs/fat/cache.c                            |  2 +-
 fs/fat/inode.c                            |  2 +-
 fs/fcntl.c                                |  2 +-
 fs/freevxfs/vxfs_super.c                  |  4 ++--
 fs/fuse/dev.c                             |  2 +-
 fs/fuse/inode.c                           |  2 +-
 fs/gfs2/main.c                            |  6 +++---
 fs/hfs/super.c                            |  2 +-
 fs/hfsplus/super.c                        |  2 +-
 fs/hpfs/super.c                           |  4 ++--
 fs/hugetlbfs/inode.c                      |  2 +-
 fs/inode.c                                |  3 +--
 fs/inotify_user.c                         |  4 ++--
 fs/isofs/inode.c                          |  2 +-
 fs/jbd/journal.c                          |  8 +++-----
 fs/jbd/revoke.c                           |  4 ++--
 fs/jbd2/journal.c                         |  8 +++-----
 fs/jbd2/revoke.c                          |  4 ++--
 fs/jffs2/malloc.c                         | 18 +++++++++---------
 fs/jffs2/super.c                          |  2 +-
 fs/jfs/jfs_metapage.c                     |  2 +-
 fs/jfs/super.c                            |  2 +-
 fs/locks.c                                |  2 +-
 fs/mbcache.c                              |  2 +-
 fs/minix/inode.c                          |  4 ++--
 fs/namespace.c                            |  2 +-
 fs/ncpfs/inode.c                          |  4 ++--
 fs/nfs/direct.c                           |  2 +-
 fs/nfs/inode.c                            |  4 ++--
 fs/nfs/pagelist.c                         |  2 +-
 fs/nfs/read.c                             |  2 +-
 fs/nfs/write.c                            |  2 +-
 fs/nfsd/nfs4state.c                       |  8 ++++----
 fs/ntfs/super.c                           | 10 +++++-----
 fs/ocfs2/dlm/dlmfs.c                      |  2 +-
 fs/ocfs2/dlm/dlmmaster.c                  |  2 +-
 fs/ocfs2/super.c                          |  2 +-
 fs/ocfs2/uptodate.c                       |  2 +-
 fs/openpromfs/inode.c                     |  2 +-
 fs/proc/inode.c                           |  4 ++--
 fs/qnx4/inode.c                           |  2 +-
 fs/reiserfs/super.c                       |  2 +-
 fs/romfs/inode.c                          |  4 ++--
 fs/smbfs/inode.c                          |  4 ++--
 fs/smbfs/request.c                        |  2 +-
 fs/sysfs/mount.c                          |  2 +-
 fs/sysv/inode.c                           |  2 +-
 fs/udf/super.c                            |  2 +-
 fs/ufs/super.c                            |  4 ++--
 fs/xfs/linux-2.6/kmem.h                   |  4 ++--
 include/linux/i2o.h                       |  3 +--
 include/linux/slab.h                      |  3 +--
 ipc/mqueue.c                              |  2 +-
 kernel/fork.c                             | 18 +++++++++---------
 kernel/nsproxy.c                          |  2 +-
 kernel/posix-timers.c                     |  2 +-
 kernel/user.c                             |  2 +-
 lib/idr.c                                 |  2 +-
 lib/radix-tree.c                          |  2 +-
 mm/mempolicy.c                            |  4 ++--
 mm/rmap.c                                 |  2 +-
 mm/shmem.c                                |  2 +-
 mm/slab.c                                 | 17 +++++++----------
 mm/slob.c                                 |  3 +--
 mm/slub.c                                 |  4 +---
 net/bridge/br_fdb.c                       |  2 +-
 net/core/flow.c                           |  2 +-
 net/core/neighbour.c                      |  2 +-
 net/core/skbuff.c                         |  4 ++--
 net/core/sock.c                           |  6 +++---
 net/dccp/ackvec.c                         |  4 ++--
 net/dccp/ccid.c                           |  2 +-
 net/dccp/ccids/lib/loss_interval.c        |  2 +-
 net/dccp/ccids/lib/packet_history.c       |  4 ++--
 net/dccp/proto.c                          |  2 +-
 net/decnet/dn_route.c                     |  2 +-
 net/decnet/dn_table.c                     |  2 +-
 net/ipv4/fib_hash.c                       |  4 ++--
 net/ipv4/fib_trie.c                       |  2 +-
 net/ipv4/inetpeer.c                       |  2 +-
 net/ipv4/ipmr.c                           |  2 +-
 net/ipv4/ipvs/ip_vs_conn.c                |  2 +-
 net/ipv4/route.c                          |  2 +-
 net/ipv4/tcp.c                            |  2 +-
 net/ipv6/ip6_fib.c                        |  2 +-
 net/ipv6/route.c                          |  2 +-
 net/ipv6/xfrm6_tunnel.c                   |  2 +-
 net/netfilter/nf_conntrack_core.c         |  2 +-
 net/netfilter/nf_conntrack_expect.c       |  2 +-
 net/netfilter/xt_hashlimit.c              |  2 +-
 net/rxrpc/af_rxrpc.c                      |  2 +-
 net/sctp/protocol.c                       |  4 ++--
 net/socket.c                              |  3 +--
 net/sunrpc/rpc_pipe.c                     |  2 +-
 net/sunrpc/sched.c                        |  4 ++--
 net/tipc/handler.c                        |  2 +-
 net/xfrm/xfrm_input.c                     |  2 +-
 net/xfrm/xfrm_policy.c                    |  2 +-
 security/keys/key.c                       |  2 +-
 security/selinux/avc.c                    |  2 +-
 security/selinux/hooks.c                  |  2 +-
 security/selinux/ss/avtab.c               |  2 +-
 165 files changed, 247 insertions(+), 268 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c
index 08d80f2f51f2..6d048490c559 100644
--- a/arch/arm/plat-s3c24xx/dma.c
+++ b/arch/arm/plat-s3c24xx/dma.c
@@ -1333,7 +1333,7 @@ int __init s3c24xx_dma_init(unsigned int channels, unsigned int irq,
 	dma_kmem = kmem_cache_create("dma_desc",
 				     sizeof(struct s3c2410_dma_buf), 0,
 				     SLAB_HWCACHE_ALIGN,
-				     s3c2410_dma_cache_ctor, NULL);
+				     s3c2410_dma_cache_ctor);
 
 	if (dma_kmem == NULL) {
 		printk(KERN_ERR "dma failed to make kmem cache\n");
diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c
index 42505541a9b1..ffecd8578247 100644
--- a/arch/arm26/mm/memc.c
+++ b/arch/arm26/mm/memc.c
@@ -176,9 +176,9 @@ void __init pgtable_cache_init(void)
 {
 	pte_cache = kmem_cache_create("pte-cache",
 				sizeof(pte_t) * PTRS_PER_PTE,
-				0, SLAB_PANIC, pte_cache_ctor, NULL);
+				0, SLAB_PANIC, pte_cache_ctor);
 
 	pgd_cache = kmem_cache_create("pgd-cache", MEMC_TABLE_SIZE +
 				sizeof(pgd_t) * PTRS_PER_PGD,
-				0, SLAB_PANIC, pgd_cache_ctor, NULL);
+				0, SLAB_PANIC, pgd_cache_ctor);
 }
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 6a68b1ae061c..6e72f22e6bbd 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -752,8 +752,7 @@ void __init pgtable_cache_init(void)
 					PTRS_PER_PMD*sizeof(pmd_t),
 					PTRS_PER_PMD*sizeof(pmd_t),
 					SLAB_PANIC,
-					pmd_ctor,
-					NULL);
+					pmd_ctor);
 		if (!SHARED_KERNEL_PMD) {
 			/* If we're in PAE mode and have a non-shared
 			   kernel pmd, then the pgd size must be a
diff --git a/arch/ia64/ia32/ia32_support.c b/arch/ia64/ia32/ia32_support.c
index beea7a0b9dc6..e13a1a1db4b5 100644
--- a/arch/ia64/ia32/ia32_support.c
+++ b/arch/ia64/ia32/ia32_support.c
@@ -253,7 +253,7 @@ ia32_init (void)
 
 		partial_page_cachep = kmem_cache_create("partial_page_cache",
 						sizeof(struct partial_page),
-						0, SLAB_PANIC, NULL, NULL);
+						0, SLAB_PANIC, NULL);
 	}
 #endif
 	return 0;
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index f72118c0844f..62b7bf2f3eab 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -804,7 +804,7 @@ int __init rtas_flash_init(void)
 
 	flash_block_cache = kmem_cache_create("rtas_flash_cache",
 				RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0,
-				rtas_block_ctor, NULL);
+				rtas_block_ctor);
 	if (!flash_block_cache) {
 		printk(KERN_ERR "%s: failed to create block cache\n",
 				__FUNCTION__);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 92a1b16fb7e3..4835f73af304 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -542,7 +542,7 @@ static int __init hugetlbpage_init(void)
 					       HUGEPTE_TABLE_SIZE,
 					       HUGEPTE_TABLE_SIZE,
 					       0,
-					       zero_ctor, NULL);
+					       zero_ctor);
 	if (! huge_pgtable_cache)
 		panic("hugetlbpage_init(): could not create hugepte cache\n");
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 1d6edf724c85..9f27bb56a61d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -178,7 +178,6 @@ void pgtable_cache_init(void)
 		pgtable_cache[i] = kmem_cache_create(name,
 						     size, size,
 						     SLAB_PANIC,
-						     zero_ctor,
-						     NULL);
+						     zero_ctor);
 	}
 }
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index f37460e5bfd2..7eb4d6cbcb74 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -654,7 +654,7 @@ static int __init spufs_init(void)
 	ret = -ENOMEM;
 	spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
 			sizeof(struct spufs_inode_info), 0,
-			SLAB_HWCACHE_ALIGN, spufs_init_once, NULL);
+			SLAB_HWCACHE_ALIGN, spufs_init_once);
 
 	if (!spufs_inode_cache)
 		goto out;
diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index d7fff752e569..b98d6c3e6f36 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -371,8 +371,7 @@ static int __init sq_api_init(void)
 	printk(KERN_NOTICE "sq: Registering store queue API.\n");
 
 	sq_cache = kmem_cache_create("store_queue_cache",
-				sizeof(struct sq_mapping), 0, 0,
-				NULL, NULL);
+				sizeof(struct sq_mapping), 0, 0, NULL);
 	if (unlikely(!sq_cache))
 		return ret;
 
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index b6a5a338145b..a08a4a958add 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -310,7 +310,7 @@ static int __init pmb_init(void)
 	BUG_ON(unlikely(nr_entries >= NR_PMB_ENTRIES));
 
 	pmb_cache = kmem_cache_create("pmb", sizeof(struct pmb_entry), 0,
-				      SLAB_PANIC, pmb_cache_ctor, NULL);
+				      SLAB_PANIC, pmb_cache_ctor);
 
 	jump_to_P2();
 
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index 8eb8a7c76ec9..7ff0a02f5813 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -262,8 +262,7 @@ void __init pgtable_cache_init(void)
 
 		tsb_caches[i] = kmem_cache_create(name,
 						  size, size,
-						  0,
-						  NULL, NULL);
+						  0, NULL);
 		if (!tsb_caches[i]) {
 			prom_printf("Could not create %s cache\n", name);
 			prom_halt();
diff --git a/block/bsg.c b/block/bsg.c
index baa04e7adf19..f2992e72b841 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -1043,7 +1043,7 @@ static int __init bsg_init(void)
 	dev_t devid;
 
 	bsg_cmd_cachep = kmem_cache_create("bsg_cmd",
-				sizeof(struct bsg_command), 0, 0, NULL, NULL);
+				sizeof(struct bsg_command), 0, 0, NULL);
 	if (!bsg_cmd_cachep) {
 		printk(KERN_ERR "bsg: failed creating slab cache\n");
 		return -ENOMEM;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index d7cadf304168..66056ca5e631 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3698,13 +3698,13 @@ int __init blk_dev_init(void)
 		panic("Failed to create kblockd\n");
 
 	request_cachep = kmem_cache_create("blkdev_requests",
-			sizeof(struct request), 0, SLAB_PANIC, NULL, NULL);
+			sizeof(struct request), 0, SLAB_PANIC, NULL);
 
 	requestq_cachep = kmem_cache_create("blkdev_queue",
-			sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL);
+			sizeof(request_queue_t), 0, SLAB_PANIC, NULL);
 
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
-			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
+			sizeof(struct io_context), 0, SLAB_PANIC, NULL);
 
 	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 2e7ba615d760..00d53c2fd1e8 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -1098,7 +1098,7 @@ void acpi_os_release_lock(acpi_spinlock lockp, acpi_cpu_flags flags)
 acpi_status
 acpi_os_create_cache(char *name, u16 size, u16 depth, acpi_cache_t ** cache)
 {
-	*cache = kmem_cache_create(name, size, 0, 0, NULL, NULL);
+	*cache = kmem_cache_create(name, size, 0, 0, NULL);
 	if (*cache == NULL)
 		return AE_ERROR;
 	else
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 478489c568a4..4f598270fa31 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -257,9 +257,9 @@ aoeblk_exit(void)
 int __init
 aoeblk_init(void)
 {
-	buf_pool_cache = kmem_cache_create("aoe_bufs", 
+	buf_pool_cache = kmem_cache_create("aoe_bufs",
 					   sizeof(struct buf),
-					   0, 0, NULL, NULL);
+					   0, 0, NULL);
 	if (buf_pool_cache == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c
index 93362eed94ed..3a9d7e2d4de6 100644
--- a/drivers/ieee1394/eth1394.c
+++ b/drivers/ieee1394/eth1394.c
@@ -1729,7 +1729,7 @@ static int __init ether1394_init_module(void)
 
 	packet_task_cache = kmem_cache_create("packet_task",
 					      sizeof(struct packet_task),
-					      0, 0, NULL, NULL);
+					      0, 0, NULL);
 	if (!packet_task_cache)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 6b8faca02f8a..bc547f1d34ba 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2998,7 +2998,6 @@ static int __init ib_mad_init_module(void)
 					 sizeof(struct ib_mad_private),
 					 0,
 					 SLAB_HWCACHE_ALIGN,
-					 NULL,
 					 NULL);
 	if (!ib_mad_cache) {
 		printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c
index 36620a22413c..cfdacb1ec279 100644
--- a/drivers/infiniband/hw/amso1100/c2_vq.c
+++ b/drivers/infiniband/hw/amso1100/c2_vq.c
@@ -85,7 +85,7 @@ int vq_init(struct c2_dev *c2dev)
 		(char) ('0' + c2dev->devnum));
 	c2dev->host_msg_cache =
 	    kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
-			      SLAB_HWCACHE_ALIGN, NULL, NULL);
+			      SLAB_HWCACHE_ALIGN, NULL);
 	if (c2dev->host_msg_cache == NULL) {
 		return -ENOMEM;
 	}
diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c
index e53a97af1260..97d108634c58 100644
--- a/drivers/infiniband/hw/ehca/ehca_av.c
+++ b/drivers/infiniband/hw/ehca/ehca_av.c
@@ -259,7 +259,7 @@ int ehca_init_av_cache(void)
 	av_cache = kmem_cache_create("ehca_cache_av",
 				   sizeof(struct ehca_av), 0,
 				   SLAB_HWCACHE_ALIGN,
-				   NULL, NULL);
+				   NULL);
 	if (!av_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
index 9e87883b561a..1e8ca3fca4aa 100644
--- a/drivers/infiniband/hw/ehca/ehca_cq.c
+++ b/drivers/infiniband/hw/ehca/ehca_cq.c
@@ -387,7 +387,7 @@ int ehca_init_cq_cache(void)
 	cq_cache = kmem_cache_create("ehca_cache_cq",
 				     sizeof(struct ehca_cq), 0,
 				     SLAB_HWCACHE_ALIGN,
-				     NULL, NULL);
+				     NULL);
 	if (!cq_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 36377c6db3d4..04c324330b7c 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -163,7 +163,7 @@ static int ehca_create_slab_caches(void)
 	ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
 					EHCA_PAGESIZE, H_CB_ALIGNMENT,
 					SLAB_HWCACHE_ALIGN,
-					NULL, NULL);
+					NULL);
 	if (!ctblk_cache) {
 		ehca_gen_err("Cannot create ctblk SLAB cache.");
 		ehca_cleanup_mrmw_cache();
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index 6262c5462d50..9f4c9d46e8ef 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -1950,13 +1950,13 @@ int ehca_init_mrmw_cache(void)
 	mr_cache = kmem_cache_create("ehca_cache_mr",
 				     sizeof(struct ehca_mr), 0,
 				     SLAB_HWCACHE_ALIGN,
-				     NULL, NULL);
+				     NULL);
 	if (!mr_cache)
 		return -ENOMEM;
 	mw_cache = kmem_cache_create("ehca_cache_mw",
 				     sizeof(struct ehca_mw), 0,
 				     SLAB_HWCACHE_ALIGN,
-				     NULL, NULL);
+				     NULL);
 	if (!mw_cache) {
 		kmem_cache_destroy(mr_cache);
 		mr_cache = NULL;
diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c
index 79d0591a8043..c85312ad292b 100644
--- a/drivers/infiniband/hw/ehca/ehca_pd.c
+++ b/drivers/infiniband/hw/ehca/ehca_pd.c
@@ -100,7 +100,7 @@ int ehca_init_pd_cache(void)
 	pd_cache = kmem_cache_create("ehca_cache_pd",
 				     sizeof(struct ehca_pd), 0,
 				     SLAB_HWCACHE_ALIGN,
-				     NULL, NULL);
+				     NULL);
 	if (!pd_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index 48e9ceacd6fa..a3146e696c5d 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -1760,7 +1760,7 @@ int ehca_init_qp_cache(void)
 	qp_cache = kmem_cache_create("ehca_cache_qp",
 				     sizeof(struct ehca_qp), 0,
 				     SLAB_HWCACHE_ALIGN,
-				     NULL, NULL);
+				     NULL);
 	if (!qp_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index effdee299b0c..5db314380271 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -637,7 +637,7 @@ static int __init iser_init(void)
 	ig.desc_cache = kmem_cache_create("iser_descriptors",
 					  sizeof (struct iser_desc),
 					  0, SLAB_HWCACHE_ALIGN,
-					  NULL, NULL);
+					  NULL);
 	if (ig.desc_cache == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index b297a6b111ac..1199d3f32ac3 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -1332,24 +1332,24 @@ int kvm_mmu_module_init(void)
 {
 	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
 					    sizeof(struct kvm_pte_chain),
-					    0, 0, NULL, NULL);
+					    0, 0, NULL);
 	if (!pte_chain_cache)
 		goto nomem;
 	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
 					    sizeof(struct kvm_rmap_desc),
-					    0, 0, NULL, NULL);
+					    0, 0, NULL);
 	if (!rmap_desc_cache)
 		goto nomem;
 
 	mmu_page_cache = kmem_cache_create("kvm_mmu_page",
 					   PAGE_SIZE,
-					   PAGE_SIZE, 0, NULL, NULL);
+					   PAGE_SIZE, 0, NULL);
 	if (!mmu_page_cache)
 		goto nomem;
 
 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
 						  sizeof(struct kvm_mmu_page),
-						  0, 0, NULL, NULL);
+						  0, 0, NULL);
 	if (!mmu_page_header_cache)
 		goto nomem;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0b66afef2d82..c8dfdb302916 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -951,7 +951,7 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 	conf->active_name = 0;
 	sc = kmem_cache_create(conf->cache_name[conf->active_name],
 			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
-			       0, 0, NULL, NULL);
+			       0, 0, NULL);
 	if (!sc)
 		return 1;
 	conf->slab_cache = sc;
@@ -1003,7 +1003,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
 			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
-			       0, 0, NULL, NULL);
+			       0, 0, NULL);
 	if (!sc)
 		return -ENOMEM;
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 64a52bd7544a..988c8ce47f58 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -1171,8 +1171,7 @@ static int __init i2o_block_init(void)
 	/* Allocate request mempool and slab */
 	size = sizeof(struct i2o_block_request);
 	i2o_blk_req_pool.slab = kmem_cache_create("i2o_block_req", size, 0,
-						  SLAB_HWCACHE_ALIGN, NULL,
-						  NULL);
+						  SLAB_HWCACHE_ALIGN, NULL);
 	if (!i2o_blk_req_pool.slab) {
 		osm_err("can't init request slab\n");
 		rc = -ENOMEM;
diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
index 8aff9385613f..7c5e29eaf118 100644
--- a/drivers/mtd/ubi/eba.c
+++ b/drivers/mtd/ubi/eba.c
@@ -1149,7 +1149,7 @@ int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
 	if (ubi_devices_cnt == 0) {
 		ltree_slab = kmem_cache_create("ubi_ltree_slab",
 					       sizeof(struct ltree_entry), 0,
-					       0, &ltree_entry_ctor, NULL);
+					       0, &ltree_entry_ctor);
 		if (!ltree_slab)
 			return -ENOMEM;
 	}
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index 9de953762097..a5a9b8d87302 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1452,7 +1452,7 @@ int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
 	if (ubi_devices_cnt == 0) {
 		wl_entries_slab = kmem_cache_create("ubi_wl_entry_slab",
 						    sizeof(struct ubi_wl_entry),
-						    0, 0, NULL, NULL);
+						    0, 0, NULL);
 		if (!wl_entries_slab)
 			return -ENOMEM;
 	}
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 6a89cefe99bb..0c67258fb9ec 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -291,7 +291,7 @@ dasd_parse_keyword( char *parsestring ) {
 		dasd_page_cache =
 			kmem_cache_create("dasd_page_cache", PAGE_SIZE,
 					  PAGE_SIZE, SLAB_CACHE_DMA,
-					  NULL, NULL );
+					  NULL);
 		if (!dasd_page_cache)
 			MESSAGE(KERN_WARNING, "%s", "Failed to create slab, "
 				"fixed buffer mode disabled.");
diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c
index a1db95925138..9726261c367d 100644
--- a/drivers/s390/scsi/zfcp_aux.c
+++ b/drivers/s390/scsi/zfcp_aux.c
@@ -259,21 +259,21 @@ zfcp_module_init(void)
 	size = sizeof(struct zfcp_fsf_req_qtcb);
 	align = calc_alignment(size);
 	zfcp_data.fsf_req_qtcb_cache =
-		kmem_cache_create("zfcp_fsf", size, align, 0, NULL, NULL);
+		kmem_cache_create("zfcp_fsf", size, align, 0, NULL);
 	if (!zfcp_data.fsf_req_qtcb_cache)
 		goto out;
 
 	size = sizeof(struct fsf_status_read_buffer);
 	align = calc_alignment(size);
 	zfcp_data.sr_buffer_cache =
-		kmem_cache_create("zfcp_sr", size, align, 0, NULL, NULL);
+		kmem_cache_create("zfcp_sr", size, align, 0, NULL);
 	if (!zfcp_data.sr_buffer_cache)
 		goto out_sr_cache;
 
 	size = sizeof(struct zfcp_gid_pn_data);
 	align = calc_alignment(size);
 	zfcp_data.gid_pn_cache =
-		kmem_cache_create("zfcp_gid", size, align, 0, NULL, NULL);
+		kmem_cache_create("zfcp_gid", size, align, 0, NULL);
 	if (!zfcp_data.gid_pn_cache)
 		goto out_gid_cache;
 
diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c
index 1c0d7578e791..b8c6810090d5 100644
--- a/drivers/scsi/aic94xx/aic94xx_init.c
+++ b/drivers/scsi/aic94xx/aic94xx_init.c
@@ -462,7 +462,7 @@ static int asd_create_global_caches(void)
 					    sizeof(struct asd_dma_tok),
 					    0,
 					    SLAB_HWCACHE_ALIGN,
-					    NULL, NULL);
+					    NULL);
 		if (!asd_dma_token_cache) {
 			asd_printk("couldn't create dma token cache\n");
 			return -ENOMEM;
@@ -474,7 +474,7 @@ static int asd_create_global_caches(void)
 						   sizeof(struct asd_ascb),
 						   0,
 						   SLAB_HWCACHE_ALIGN,
-						   NULL, NULL);
+						   NULL);
 		if (!asd_ascb_cache) {
 			asd_printk("couldn't create ascb cache\n");
 			goto Err;
diff --git a/drivers/scsi/libsas/sas_init.c b/drivers/scsi/libsas/sas_init.c
index 965698c8b7bf..1396c83b0c9c 100644
--- a/drivers/scsi/libsas/sas_init.c
+++ b/drivers/scsi/libsas/sas_init.c
@@ -292,7 +292,7 @@ EXPORT_SYMBOL_GPL(sas_domain_release_transport);
 static int __init sas_class_init(void)
 {
 	sas_task_cache = kmem_cache_create("sas_task", sizeof(struct sas_task),
-					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+					   0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!sas_task_cache)
 		return -ENOMEM;
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index b5a77b0c0deb..92376f9dfdd5 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -2723,7 +2723,7 @@ qla2x00_module_init(void)
 
 	/* Allocate cache for SRBs. */
 	srb_cachep = kmem_cache_create("qla2xxx_srbs", sizeof(srb_t), 0,
-	    SLAB_HWCACHE_ALIGN, NULL, NULL);
+	    SLAB_HWCACHE_ALIGN, NULL);
 	if (srb_cachep == NULL) {
 		printk(KERN_ERR
 		    "qla2xxx: Unable to allocate SRB cache...Failing load!\n");
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index e69160a7bc60..b1d565c12c5b 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -1677,7 +1677,7 @@ static int __init qla4xxx_module_init(void)
 
 	/* Allocate cache for SRBs. */
 	srb_cachep = kmem_cache_create("qla4xxx_srbs", sizeof(struct srb), 0,
-				       SLAB_HWCACHE_ALIGN, NULL, NULL);
+				       SLAB_HWCACHE_ALIGN, NULL);
 	if (srb_cachep == NULL) {
 		printk(KERN_ERR
 		       "%s: Unable to allocate SRB cache..."
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index a691dda40d2c..a5de1a829a76 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -288,7 +288,7 @@ int scsi_setup_command_freelist(struct Scsi_Host *shost)
 	if (!pool->users) {
 		pool->slab = kmem_cache_create(pool->name,
 				sizeof(struct scsi_cmnd), 0,
-				pool->slab_flags, NULL, NULL);
+				pool->slab_flags, NULL);
 		if (!pool->slab)
 			goto fail;
 	}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1f5a07bf2a75..da63c544919b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1661,7 +1661,7 @@ int __init scsi_init_queue(void)
 
 	scsi_io_context_cache = kmem_cache_create("scsi_io_context",
 					sizeof(struct scsi_io_context),
-					0, 0, NULL, NULL);
+					0, 0, NULL);
 	if (!scsi_io_context_cache) {
 		printk(KERN_ERR "SCSI: can't init scsi io context cache\n");
 		return -ENOMEM;
@@ -1672,7 +1672,7 @@ int __init scsi_init_queue(void)
 		int size = sgp->size * sizeof(struct scatterlist);
 
 		sgp->slab = kmem_cache_create(sgp->name, size, 0,
-				SLAB_HWCACHE_ALIGN, NULL, NULL);
+				SLAB_HWCACHE_ALIGN, NULL);
 		if (!sgp->slab) {
 			printk(KERN_ERR "SCSI: can't init sg slab %s\n",
 					sgp->name);
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 2570f48a69c7..371b69c110bc 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -585,7 +585,7 @@ static int __init scsi_tgt_init(void)
 
 	scsi_tgt_cmd_cache = kmem_cache_create("scsi_tgt_cmd",
 					       sizeof(struct scsi_tgt_cmd),
-					       0, 0, NULL, NULL);
+					       0, 0, NULL);
 	if (!scsi_tgt_cmd_cache)
 		return -ENOMEM;
 
diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index 76c555a67dac..805e5fc5f5db 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -933,7 +933,7 @@ static int __init uhci_hcd_init(void)
 	}
 
 	uhci_up_cachep = kmem_cache_create("uhci_urb_priv",
-		sizeof(struct urb_priv), 0, 0, NULL, NULL);
+		sizeof(struct urb_priv), 0, 0, NULL);
 	if (!uhci_up_cachep)
 		goto up_failed;
 
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index 982b773d71e6..8f27a9e1c36b 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -340,7 +340,7 @@ static int mon_text_open(struct inode *inode, struct file *file)
 	snprintf(rp->slab_name, SLAB_NAME_SZ, "mon_text_%p", rp);
 	rp->e_slab = kmem_cache_create(rp->slab_name,
 	    sizeof(struct mon_event_text), sizeof(long), 0,
-	    mon_text_ctor, NULL);
+	    mon_text_ctor);
 	if (rp->e_slab == NULL) {
 		rc = -ENOMEM;
 		goto err_slab;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index de2ed5ca3351..1c9fd3029496 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -234,14 +234,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (adfs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 6d0ebc321530..c80191ae2059 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -99,7 +99,7 @@ static int init_inodecache(void)
 					     sizeof(struct affs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (affs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 993cdf1cce3a..b8808b40f82b 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -89,8 +89,7 @@ int __init afs_fs_init(void)
 					     sizeof(struct afs_vnode),
 					     0,
 					     SLAB_HWCACHE_ALIGN,
-					     afs_i_init_once,
-					     NULL);
+					     afs_i_init_once);
 	if (!afs_inode_cachep) {
 		printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
 		return ret;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a5c5171c2828..a45141827681 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -414,7 +414,7 @@ befs_read_inode(struct inode *inode)
 }
 
 /* Initialize the inode cache. Called at fs setup.
- * 
+ *
  * Taken from NFS implementation by Al Viro.
  */
 static int
@@ -424,7 +424,7 @@ befs_init_inodecache(void)
 					      sizeof (struct befs_inode_info),
 					      0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					      init_once, NULL);
+					      init_once);
 	if (befs_inode_cachep == NULL) {
 		printk(KERN_ERR "befs_init_inodecache: "
 		       "Couldn't initalize inode slabcache\n");
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 58c7bd9f5301..f346eb14e86f 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -250,14 +250,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&bi->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
 					     sizeof(struct bfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (bfs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/bio.c b/fs/bio.c
index 33e46340a766..0d2c2d38b7ba 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1187,7 +1187,7 @@ static void __init biovec_init_slabs(void)
 
 		size = bvs->nr_vecs * sizeof(struct bio_vec);
 		bvs->slab = kmem_cache_create(bvs->name, size, 0,
-                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	}
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3635315e3b99..2980eabe5779 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -517,7 +517,7 @@ void __init bdev_cache_init(void)
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 				SLAB_MEM_SPREAD|SLAB_PANIC),
-			init_once, NULL);
+			init_once);
 	err = register_filesystem(&bd_type);
 	if (err)
 		panic("Cannot register bdev pseudo-fs");
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1fd0dc85f53c..cabb6a55d7dd 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -719,7 +719,7 @@ cifs_init_inodecache(void)
 					      sizeof (struct cifsInodeInfo),
 					      0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					      cifs_init_once, NULL);
+					      cifs_init_once);
 	if (cifs_inode_cachep == NULL)
 		return -ENOMEM;
 
@@ -748,7 +748,7 @@ cifs_init_request_bufs(void)
 	cifs_req_cachep = kmem_cache_create("cifs_request",
 					    CIFSMaxBufSize +
 					    MAX_CIFS_HDR_SIZE, 0,
-					    SLAB_HWCACHE_ALIGN, NULL, NULL);
+					    SLAB_HWCACHE_ALIGN, NULL);
 	if (cifs_req_cachep == NULL)
 		return -ENOMEM;
 
@@ -776,7 +776,7 @@ cifs_init_request_bufs(void)
 	alloc of large cifs buffers even when page debugging is on */
 	cifs_sm_req_cachep = kmem_cache_create("cifs_small_rq",
 			MAX_CIFS_SMALL_BUFFER_SIZE, 0, SLAB_HWCACHE_ALIGN,
-			NULL, NULL);
+			NULL);
 	if (cifs_sm_req_cachep == NULL) {
 		mempool_destroy(cifs_req_poolp);
 		kmem_cache_destroy(cifs_req_cachep);
@@ -817,7 +817,7 @@ cifs_init_mids(void)
 {
 	cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids",
 				sizeof (struct mid_q_entry), 0,
-				SLAB_HWCACHE_ALIGN, NULL, NULL);
+				SLAB_HWCACHE_ALIGN, NULL);
 	if (cifs_mid_cachep == NULL)
 		return -ENOMEM;
 
@@ -830,7 +830,7 @@ cifs_init_mids(void)
 
 	cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
 				sizeof (struct oplock_q_entry), 0,
-				SLAB_HWCACHE_ALIGN, NULL, NULL);
+				SLAB_HWCACHE_ALIGN, NULL);
 	if (cifs_oplock_cachep == NULL) {
 		mempool_destroy(cifs_mid_poolp);
 		kmem_cache_destroy(cifs_mid_cachep);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6771a4271e33..342f4e0d582e 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -64,13 +64,13 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 int coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
 				sizeof(struct coda_inode_info),
 				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once, NULL);
+				init_once);
 	if (coda_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index b00d962de833..871b0cb61839 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -136,7 +136,7 @@ static int __init configfs_init(void)
 
 	configfs_dir_cachep = kmem_cache_create("configfs_dir_cache",
 						sizeof(struct configfs_dirent),
-						0, 0, NULL, NULL);
+						0, 0, NULL);
 	if (!configfs_dir_cachep)
 		goto out;
 
diff --git a/fs/dcache.c b/fs/dcache.c
index cb9d05056b54..678d39deb607 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2165,10 +2165,10 @@ void __init vfs_caches_init(unsigned long mempages)
 	mempages -= reserve;
 
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	dcache_init(mempages);
 	inode_init(mempages);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 21af1629f9bc..c1208f53bd74 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -205,7 +205,7 @@ static int dcookie_init(void)
 
 	dcookie_cache = kmem_cache_create("dcookie_cache",
 		sizeof(struct dcookie_struct),
-		0, 0, NULL, NULL);
+		0, 0, NULL);
 
 	if (!dcookie_cache)
 		goto out;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0553a6158dcb..dd362739d291 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1449,7 +1449,7 @@ int dlm_lowcomms_start(void)
 	error = -ENOMEM;
 	con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
 				      __alignof__(struct connection), 0,
-				      NULL, NULL);
+				      NULL);
 	if (!con_cache)
 		goto out;
 
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index fb9e2ee998ae..ecf0e5cb2035 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -23,7 +23,7 @@ int dlm_memory_init(void)
 	int ret = 0;
 
 	lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
-				__alignof__(struct dlm_lkb), 0, NULL, NULL);
+				__alignof__(struct dlm_lkb), 0, NULL);
 	if (!lkb_cache)
 		ret = -ENOMEM;
 	return ret;
diff --git a/fs/dnotify.c b/fs/dnotify.c
index 936409fcd939..28d01ed66de0 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(dnotify_parent);
 static int __init dnotify_init(void)
 {
 	dn_cache = kmem_cache_create("dnotify_cache",
-		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL, NULL);
+		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
 	return 0;
 }
 
diff --git a/fs/dquot.c b/fs/dquot.c
index 7e273151f589..de9a29f64ff3 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1848,11 +1848,11 @@ static int __init dquot_init(void)
 
 	register_sysctl_table(sys_table);
 
-	dquot_cachep = kmem_cache_create("dquot", 
+	dquot_cachep = kmem_cache_create("dquot",
 			sizeof(struct dquot), sizeof(unsigned long) * 4,
 			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 				SLAB_MEM_SPREAD|SLAB_PANIC),
-			NULL, NULL);
+			NULL);
 
 	order = 0;
 	dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 02ca6f1e55d7..e557a6766927 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -677,7 +677,7 @@ static int ecryptfs_init_kmem_caches(void)
 
 		info = &ecryptfs_cache_infos[i];
 		*(info->cache) = kmem_cache_create(info->name, info->size,
-				0, SLAB_HWCACHE_ALIGN, info->ctor, NULL);
+				0, SLAB_HWCACHE_ALIGN, info->ctor);
 		if (!*(info->cache)) {
 			ecryptfs_free_kmem_caches();
 			ecryptfs_printk(KERN_WARNING, "%s: "
diff --git a/fs/efs/super.c b/fs/efs/super.c
index d360c81f3a72..ce4acb8ff819 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -75,13 +75,13 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
 				sizeof(struct efs_inode_info),
 				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once, NULL);
+				init_once);
 	if (efs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0b73cd45a06d..77b9953624f4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1324,12 +1324,12 @@ static int __init eventpoll_init(void)
 	/* Allocates slab cache used to allocate "struct epitem" items */
 	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
 			0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
-			NULL, NULL);
+			NULL);
 
 	/* Allocates slab cache used to allocate "struct eppoll_entry" */
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
 			sizeof(struct eppoll_entry), 0,
-			EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL);
+			EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
 
 	return 0;
 }
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a6b1072daea7..68579a0ed3f0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -167,14 +167,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 #endif
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
 					     sizeof(struct ext2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (ext2_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4f84dc86628a..f0614e3f1fe8 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -490,7 +490,7 @@ static int init_inodecache(void)
 					     sizeof(struct ext3_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (ext3_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6dcbb28dc06d..75adbb64e028 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -541,7 +541,7 @@ static int init_inodecache(void)
 					     sizeof(struct ext4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3c9c8a15ec73..be6f89b152ca 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -48,7 +48,7 @@ int __init fat_cache_init(void)
 	fat_cache_cachep = kmem_cache_create("fat_cache",
 				sizeof(struct fat_cache),
 				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once, NULL);
+				init_once);
 	if (fat_cache_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0a7ddb39a593..4baa5f205368 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -514,7 +514,7 @@ static int __init fat_init_inodecache(void)
 					     sizeof(struct msdos_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (fat_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3f22e9f4f691..78b2ff044054 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -638,7 +638,7 @@ EXPORT_SYMBOL(kill_fasync);
 static int __init fasync_init(void)
 {
 	fasync_cache = kmem_cache_create("fasync_cache",
-		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL, NULL);
+		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
 	return 0;
 }
 
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 647d600f0bc8..4f95572d2722 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -263,8 +263,8 @@ vxfs_init(void)
 	int rv;
 
 	vxfs_inode_cachep = kmem_cache_create("vxfs_inode",
-			sizeof(struct vxfs_inode_info), 0, 
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL);
+			sizeof(struct vxfs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
 	if (!vxfs_inode_cachep)
 		return -ENOMEM;
 	rv = register_filesystem(&vxfs_fs_type);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 357764d85ff1..3ad22beb24c2 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1044,7 +1044,7 @@ int __init fuse_dev_init(void)
 	int err = -ENOMEM;
 	fuse_req_cachep = kmem_cache_create("fuse_request",
 					    sizeof(struct fuse_req),
-					    0, 0, NULL, NULL);
+					    0, 0, NULL);
 	if (!fuse_req_cachep)
 		goto out;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cc5efc13496a..5448f625ab56 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -706,7 +706,7 @@ static int __init fuse_fs_init(void)
 	fuse_inode_cachep = kmem_cache_create("fuse_inode",
 					      sizeof(struct fuse_inode),
 					      0, SLAB_HWCACHE_ALIGN,
-					      fuse_inode_init_once, NULL);
+					      fuse_inode_init_once);
 	err = -ENOMEM;
 	if (!fuse_inode_cachep)
 		goto out_unreg2;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 787a0edef100..d5d4e68b8807 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -72,7 +72,7 @@ static int __init init_gfs2_fs(void)
 	gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
 					      sizeof(struct gfs2_glock),
 					      0, 0,
-					      gfs2_init_glock_once, NULL);
+					      gfs2_init_glock_once);
 	if (!gfs2_glock_cachep)
 		goto fail;
 
@@ -80,13 +80,13 @@ static int __init init_gfs2_fs(void)
 					      sizeof(struct gfs2_inode),
 					      0,  SLAB_RECLAIM_ACCOUNT|
 					          SLAB_MEM_SPREAD,
-					      gfs2_init_inode_once, NULL);
+					      gfs2_init_inode_once);
 	if (!gfs2_inode_cachep)
 		goto fail;
 
 	gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
 						sizeof(struct gfs2_bufdata),
-					        0, 0, NULL, NULL);
+					        0, 0, NULL);
 	if (!gfs2_bufdata_cachep)
 		goto fail;
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 92cf8751e428..6c5f92dfb500 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -443,7 +443,7 @@ static int __init init_hfs_fs(void)
 
 	hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
 		sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
-		hfs_init_once, NULL);
+		hfs_init_once);
 	if (!hfs_inode_cachep)
 		return -ENOMEM;
 	err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 6d87a2a9534d..7b0f2e5a44e2 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -479,7 +479,7 @@ static int __init init_hfsplus_fs(void)
 
 	hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
 		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
-		hfsplus_init_once, NULL);
+		hfsplus_init_once);
 	if (!hfsplus_inode_cachep)
 		return -ENOMEM;
 	err = register_filesystem(&hfsplus_fs_type);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 29cc34abb2ea..89612ee7c80d 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -181,14 +181,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 	mutex_init(&ei->i_parent_mutex);
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
 					     sizeof(struct hpfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (hpfs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d145cb79c30a..c848a191525d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -848,7 +848,7 @@ static int __init init_hugetlbfs_fs(void)
 
 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
 					sizeof(struct hugetlbfs_inode_info),
-					0, 0, init_once, NULL);
+					0, 0, init_once);
 	if (hugetlbfs_inode_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/inode.c b/fs/inode.c
index 320e088d0b28..29f5068f819b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1388,8 +1388,7 @@ void __init inode_init(unsigned long mempages)
 					 0,
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
 					 SLAB_MEM_SPREAD),
-					 init_once,
-					 NULL);
+					 init_once);
 	register_shrinker(&icache_shrinker);
 
 	/* Hash may have been set up in inode_init_early */
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 9f2224f65a18..9bf2f6c09df6 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -716,10 +716,10 @@ static int __init inotify_user_setup(void)
 
 	watch_cachep = kmem_cache_create("inotify_watch_cache",
 					 sizeof(struct inotify_user_watch),
-					 0, SLAB_PANIC, NULL, NULL);
+					 0, SLAB_PANIC, NULL);
 	event_cachep = kmem_cache_create("inotify_event_cache",
 					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL, NULL);
+					 0, SLAB_PANIC, NULL);
 
 	return 0;
 }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4f5418be0590..95c72aa81867 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -86,7 +86,7 @@ static int init_inodecache(void)
 					sizeof(struct iso_inode_info),
 					0, (SLAB_RECLAIM_ACCOUNT|
 					SLAB_MEM_SPREAD),
-					init_once, NULL);
+					init_once);
 	if (isofs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 46fe7439fb91..06ab3c10b1b8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1668,7 +1668,7 @@ static int journal_create_jbd_slab(size_t slab_size)
 	 * boundary.
 	 */
 	jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
-				slab_size, slab_size, 0, NULL, NULL);
+				slab_size, slab_size, 0, NULL);
 	if (!jbd_slab[i]) {
 		printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
 		return -ENOMEM;
@@ -1711,8 +1711,7 @@ static int journal_init_journal_head_cache(void)
 				sizeof(struct journal_head),
 				0,		/* offset */
 				0,		/* flags */
-				NULL,		/* ctor */
-				NULL);		/* dtor */
+				NULL);		/* ctor */
 	retval = 0;
 	if (journal_head_cache == 0) {
 		retval = -ENOMEM;
@@ -2008,8 +2007,7 @@ static int __init journal_init_handle_cache(void)
 				sizeof(handle_t),
 				0,		/* offset */
 				0,		/* flags */
-				NULL,		/* ctor */
-				NULL);		/* dtor */
+				NULL);		/* ctor */
 	if (jbd_handle_cache == NULL) {
 		printk(KERN_EMERG "JBD: failed to create handle cache\n");
 		return -ENOMEM;
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 8db2fa25170b..62e13c8db132 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -170,13 +170,13 @@ int __init journal_init_revoke_caches(void)
 {
 	revoke_record_cache = kmem_cache_create("revoke_record",
 					   sizeof(struct jbd_revoke_record_s),
-					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+					   0, SLAB_HWCACHE_ALIGN, NULL);
 	if (revoke_record_cache == 0)
 		return -ENOMEM;
 
 	revoke_table_cache = kmem_cache_create("revoke_table",
 					   sizeof(struct jbd_revoke_table_s),
-					   0, 0, NULL, NULL);
+					   0, 0, NULL);
 	if (revoke_table_cache == 0) {
 		kmem_cache_destroy(revoke_record_cache);
 		revoke_record_cache = NULL;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f290cb7cb834..f37324aee817 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1680,7 +1680,7 @@ static int jbd2_journal_create_jbd_slab(size_t slab_size)
 	 * boundary.
 	 */
 	jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
-				slab_size, slab_size, 0, NULL, NULL);
+				slab_size, slab_size, 0, NULL);
 	if (!jbd_slab[i]) {
 		printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
 		return -ENOMEM;
@@ -1723,8 +1723,7 @@ static int journal_init_jbd2_journal_head_cache(void)
 				sizeof(struct journal_head),
 				0,		/* offset */
 				0,		/* flags */
-				NULL,		/* ctor */
-				NULL);		/* dtor */
+				NULL);		/* ctor */
 	retval = 0;
 	if (jbd2_journal_head_cache == 0) {
 		retval = -ENOMEM;
@@ -2006,8 +2005,7 @@ static int __init journal_init_handle_cache(void)
 				sizeof(handle_t),
 				0,		/* offset */
 				0,		/* flags */
-				NULL,		/* ctor */
-				NULL);		/* dtor */
+				NULL);		/* ctor */
 	if (jbd2_handle_cache == NULL) {
 		printk(KERN_EMERG "JBD: failed to create handle cache\n");
 		return -ENOMEM;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 28cac049a56b..01d88975e0c5 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -171,13 +171,13 @@ int __init jbd2_journal_init_revoke_caches(void)
 {
 	jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
 					   sizeof(struct jbd2_revoke_record_s),
-					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+					   0, SLAB_HWCACHE_ALIGN, NULL);
 	if (jbd2_revoke_record_cache == 0)
 		return -ENOMEM;
 
 	jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
 					   sizeof(struct jbd2_revoke_table_s),
-					   0, 0, NULL, NULL);
+					   0, 0, NULL);
 	if (jbd2_revoke_table_cache == 0) {
 		kmem_cache_destroy(jbd2_revoke_record_cache);
 		jbd2_revoke_record_cache = NULL;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 35c1a5e30ba1..f9211252b5f1 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -33,56 +33,56 @@ int __init jffs2_create_slab_caches(void)
 {
 	full_dnode_slab = kmem_cache_create("jffs2_full_dnode",
 					    sizeof(struct jffs2_full_dnode),
-					    0, 0, NULL, NULL);
+					    0, 0, NULL);
 	if (!full_dnode_slab)
 		goto err;
 
 	raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent",
 					    sizeof(struct jffs2_raw_dirent),
-					    0, 0, NULL, NULL);
+					    0, 0, NULL);
 	if (!raw_dirent_slab)
 		goto err;
 
 	raw_inode_slab = kmem_cache_create("jffs2_raw_inode",
 					   sizeof(struct jffs2_raw_inode),
-					   0, 0, NULL, NULL);
+					   0, 0, NULL);
 	if (!raw_inode_slab)
 		goto err;
 
 	tmp_dnode_info_slab = kmem_cache_create("jffs2_tmp_dnode",
 						sizeof(struct jffs2_tmp_dnode_info),
-						0, 0, NULL, NULL);
+						0, 0, NULL);
 	if (!tmp_dnode_info_slab)
 		goto err;
 
 	raw_node_ref_slab = kmem_cache_create("jffs2_refblock",
 					      sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1),
-					      0, 0, NULL, NULL);
+					      0, 0, NULL);
 	if (!raw_node_ref_slab)
 		goto err;
 
 	node_frag_slab = kmem_cache_create("jffs2_node_frag",
 					   sizeof(struct jffs2_node_frag),
-					   0, 0, NULL, NULL);
+					   0, 0, NULL);
 	if (!node_frag_slab)
 		goto err;
 
 	inode_cache_slab = kmem_cache_create("jffs2_inode_cache",
 					     sizeof(struct jffs2_inode_cache),
-					     0, 0, NULL, NULL);
+					     0, 0, NULL);
 	if (!inode_cache_slab)
 		goto err;
 
 #ifdef CONFIG_JFFS2_FS_XATTR
 	xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum",
 					     sizeof(struct jffs2_xattr_datum),
-					     0, 0, NULL, NULL);
+					     0, 0, NULL);
 	if (!xattr_datum_cache)
 		goto err;
 
 	xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref",
 					   sizeof(struct jffs2_xattr_ref),
-					   0, 0, NULL, NULL);
+					   0, 0, NULL);
 	if (!xattr_ref_cache)
 		goto err;
 #endif
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index e220d3bd610d..be2b70c2ec16 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -192,7 +192,7 @@ static int __init init_jffs2_fs(void)
 					     sizeof(struct jffs2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     jffs2_i_init_once, NULL);
+					     jffs2_i_init_once);
 	if (!jffs2_inode_cachep) {
 		printk(KERN_ERR "JFFS2 error: Failed to initialise inode cache\n");
 		return -ENOMEM;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 77c7f1129dde..62e96be02acf 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -213,7 +213,7 @@ int __init metapage_init(void)
 	 * Allocate the metapage structures
 	 */
 	metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage),
-					   0, 0, init_once, NULL);
+					   0, 0, init_once);
 	if (metapage_cache == NULL)
 		return -ENOMEM;
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 929fceca7999..4b372f550652 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -776,7 +776,7 @@ static int __init init_jfs_fs(void)
 	jfs_inode_cachep =
 	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
 			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-			    init_once, NULL);
+			    init_once);
 	if (jfs_inode_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/locks.c b/fs/locks.c
index 4f2d749ac624..310510637247 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2276,7 +2276,7 @@ static int __init filelock_init(void)
 {
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC,
-			init_once, NULL);
+			init_once);
 	return 0;
 }
 
diff --git a/fs/mbcache.c b/fs/mbcache.c
index fbb1d02f8791..1046cbefbfbf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -292,7 +292,7 @@ mb_cache_create(const char *name, struct mb_cache_op *cache_op,
 			INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
 	}
 	cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
-		SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL);
+		SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
 	if (!cache->c_entry_cache)
 		goto fail;
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index be4044614ac8..43668d7d668f 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -75,14 +75,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	minix_inode_cachep = kmem_cache_create("minix_inode_cache",
 					     sizeof(struct minix_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (minix_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index 4198003d7e18..ddbda13c2d31 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1801,7 +1801,7 @@ void __init mnt_init(unsigned long mempages)
 	init_rwsem(&namespace_sem);
 
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
-			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
 	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf06eb9f050e..7f8536dbdedc 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -63,14 +63,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 	mutex_init(&ei->open_mutex);
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
 					     sizeof(struct ncp_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (ncp_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a5c82b6f3b45..fcf4d384610e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -875,7 +875,7 @@ int __init nfs_init_directcache(void)
 						sizeof(struct nfs_direct_req),
 						0, (SLAB_RECLAIM_ACCOUNT|
 							SLAB_MEM_SPREAD),
-						NULL, NULL);
+						NULL);
 	if (nfs_direct_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3d9fccf4ef93..bca6cdcb9f0d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1165,14 +1165,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 	nfsi->npages = 0;
 	nfs4_init_once(nfsi);
 }
- 
+
 static int __init nfs_init_inodecache(void)
 {
 	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
 					     sizeof(struct nfs_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (nfs_inode_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index f56dae5216f4..345bb9b4765b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -442,7 +442,7 @@ int __init nfs_init_nfspagecache(void)
 	nfs_page_cachep = kmem_cache_create("nfs_page",
 					    sizeof(struct nfs_page),
 					    0, SLAB_HWCACHE_ALIGN,
-					    NULL, NULL);
+					    NULL);
 	if (nfs_page_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6ae2e58ed05a..19e05633f4e3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -598,7 +598,7 @@ int __init nfs_init_readpagecache(void)
 	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
 					     sizeof(struct nfs_read_data),
 					     0, SLAB_HWCACHE_ALIGN,
-					     NULL, NULL);
+					     NULL);
 	if (nfs_rdata_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 73ac992ece85..ef97e0c0f5b1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1467,7 +1467,7 @@ int __init nfs_init_writepagecache(void)
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
 					     sizeof(struct nfs_write_data),
 					     0, SLAB_HWCACHE_ALIGN,
-					     NULL, NULL);
+					     NULL);
 	if (nfs_wdata_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6284807bd37e..3f559700788f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1032,19 +1032,19 @@ static int
 nfsd4_init_slabs(void)
 {
 	stateowner_slab = kmem_cache_create("nfsd4_stateowners",
-			sizeof(struct nfs4_stateowner), 0, 0, NULL, NULL);
+			sizeof(struct nfs4_stateowner), 0, 0, NULL);
 	if (stateowner_slab == NULL)
 		goto out_nomem;
 	file_slab = kmem_cache_create("nfsd4_files",
-			sizeof(struct nfs4_file), 0, 0, NULL, NULL);
+			sizeof(struct nfs4_file), 0, 0, NULL);
 	if (file_slab == NULL)
 		goto out_nomem;
 	stateid_slab = kmem_cache_create("nfsd4_stateids",
-			sizeof(struct nfs4_stateid), 0, 0, NULL, NULL);
+			sizeof(struct nfs4_stateid), 0, 0, NULL);
 	if (stateid_slab == NULL)
 		goto out_nomem;
 	deleg_slab = kmem_cache_create("nfsd4_delegations",
-			sizeof(struct nfs4_delegation), 0, 0, NULL, NULL);
+			sizeof(struct nfs4_delegation), 0, 0, NULL);
 	if (deleg_slab == NULL)
 		goto out_nomem;
 	return 0;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4566b9182551..90c4e3a29706 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3143,7 +3143,7 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
 			sizeof(ntfs_index_context), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
+			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
 	if (!ntfs_index_ctx_cache) {
 		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
 				ntfs_index_ctx_cache_name);
@@ -3151,7 +3151,7 @@ static int __init init_ntfs_fs(void)
 	}
 	ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
 			sizeof(ntfs_attr_search_ctx), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
+			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
 	if (!ntfs_attr_ctx_cache) {
 		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
 				ntfs_attr_ctx_cache_name);
@@ -3160,7 +3160,7 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
 			(NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
-			SLAB_HWCACHE_ALIGN, NULL, NULL);
+			SLAB_HWCACHE_ALIGN, NULL);
 	if (!ntfs_name_cache) {
 		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
 				ntfs_name_cache_name);
@@ -3169,7 +3169,7 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
 			sizeof(ntfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL);
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
 	if (!ntfs_inode_cache) {
 		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
 				ntfs_inode_cache_name);
@@ -3179,7 +3179,7 @@ static int __init init_ntfs_fs(void)
 	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
 			sizeof(big_ntfs_inode), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-			ntfs_big_inode_init_once, NULL);
+			ntfs_big_inode_init_once);
 	if (!ntfs_big_inode_cache) {
 		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
 				ntfs_big_inode_cache_name);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index fd8cb1badc9b..7418dc83de1c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -592,7 +592,7 @@ static int __init init_dlmfs_fs(void)
 				sizeof(struct dlmfs_inode_private),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 					SLAB_MEM_SPREAD),
-				dlmfs_init_once, NULL);
+				dlmfs_init_once);
 	if (!dlmfs_inode_cache)
 		return -ENOMEM;
 	cleanup_inode = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 65b2b9b92688..62e4a7daa286 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -510,7 +510,7 @@ int dlm_init_mle_cache(void)
 	dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
 					  sizeof(struct dlm_master_list_entry),
 					  0, SLAB_HWCACHE_ALIGN,
-					  NULL, NULL);
+					  NULL);
 	if (dlm_mle_cache == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3a5a1ed09ac9..200c7d4790dc 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -984,7 +984,7 @@ static int ocfs2_initialize_mem_caches(void)
 				       0,
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-				       ocfs2_inode_init_once, NULL);
+				       ocfs2_inode_init_once);
 	if (!ocfs2_inode_cachep)
 		return -ENOMEM;
 
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 39814b900fc0..4da8851f2b23 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -548,7 +548,7 @@ int __init init_ocfs2_uptodate_cache(void)
 {
 	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
 				  sizeof(struct ocfs2_meta_cache_item),
-				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+				  0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!ocfs2_uptodate_cachep)
 		return -ENOMEM;
 
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index e62397341c36..dd86be2aa6c9 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -431,7 +431,7 @@ static int __init init_openprom_fs(void)
 					    0,
 					    (SLAB_RECLAIM_ACCOUNT |
 					     SLAB_MEM_SPREAD),
-					    op_inode_init_once, NULL);
+					    op_inode_init_once);
 	if (!op_inode_cachep)
 		return -ENOMEM;
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index dd28e86ab422..94e2c1adf184 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -112,14 +112,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 int __init proc_init_inodecache(void)
 {
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (proc_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 8d256eb11813..1bc8d873a9e1 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -545,7 +545,7 @@ static int init_inodecache(void)
 					     sizeof(struct qnx4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (qnx4_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5a93cfe1a032..5b68dd3f191a 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -527,7 +527,7 @@ static int init_inodecache(void)
 							 reiserfs_inode_info),
 						  0, (SLAB_RECLAIM_ACCOUNT|
 							SLAB_MEM_SPREAD),
-						  init_once, NULL);
+						  init_once);
 	if (reiserfs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 2284e03342c6..dae7945f90e4 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -572,14 +572,14 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
 					     sizeof(struct romfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (romfs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 6724a6cf01ff..73d1450a95d4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -73,14 +73,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	smb_inode_cachep = kmem_cache_create("smb_inode_cache",
 					     sizeof(struct smb_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (smb_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index 3f54a0f80fae..ca4b2d59c0ca 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -40,7 +40,7 @@ int smb_init_request_cache(void)
 	req_cachep = kmem_cache_create("smb_request",
 				       sizeof(struct smb_request), 0,
 				       SMB_SLAB_DEBUG | SLAB_HWCACHE_ALIGN,
-				       NULL, NULL);
+				       NULL);
 	if (req_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 60714d075c2f..fbc7b65fe262 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -86,7 +86,7 @@ int __init sysfs_init(void)
 
 	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
 					      sizeof(struct sysfs_dirent),
-					      0, 0, NULL, NULL);
+					      0, 0, NULL);
 	if (!sysfs_dir_cachep)
 		goto out;
 
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 564411693394..7c4e5d302abb 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -342,7 +342,7 @@ int __init sysv_init_icache(void)
 	sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
 			sizeof(struct sysv_inode_info), 0,
 			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-			init_once, NULL);
+			init_once);
 	if (!sysv_inode_cachep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 911387aa1810..72097ee6b752 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -149,7 +149,7 @@ static int init_inodecache(void)
 					     sizeof(struct udf_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT |
 						 SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (udf_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2b3011689e89..73402c5eeb8a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1240,14 +1240,14 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
 static int init_inodecache(void)
 {
 	ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
 					     sizeof(struct ufs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once);
 	if (ufs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 4b6470cf87f0..b4acc7f3c374 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -74,14 +74,14 @@ extern void  kmem_free(void *, size_t);
 static inline kmem_zone_t *
 kmem_zone_init(int size, char *zone_name)
 {
-	return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
+	return kmem_cache_create(zone_name, size, 0, 0, NULL);
 }
 
 static inline kmem_zone_t *
 kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
 		     void (*construct)(void *, kmem_zone_t *, unsigned long))
 {
-	return kmem_cache_create(zone_name, size, 0, flags, construct, NULL);
+	return kmem_cache_create(zone_name, size, 0, flags, construct);
 }
 
 static inline void
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 333a370a3bdc..9752307d16ba 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -946,8 +946,7 @@ static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
 	strcpy(pool->name, name);
 
 	pool->slab =
-	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL,
-			      NULL);
+	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!pool->slab)
 		goto free_name;
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0e1d0daef6a2..7d0ecc1659f0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -51,7 +51,6 @@ int slab_is_available(void);
 
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
-			void (*)(void *, struct kmem_cache *, unsigned long),
 			void (*)(void *, struct kmem_cache *, unsigned long));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
@@ -70,7 +69,7 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
  */
 #define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
 		sizeof(struct __struct), __alignof__(struct __struct),\
-		(__flags), NULL, NULL)
+		(__flags), NULL)
 
 /*
  * The largest kmalloc size supported by the slab allocators is
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a242c83d89d6..145d5a0d299f 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1253,7 +1253,7 @@ static int __init init_mqueue_fs(void)
 
 	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
 				sizeof(struct mqueue_inode_info), 0,
-				SLAB_HWCACHE_ALIGN, init_once, NULL);
+				SLAB_HWCACHE_ALIGN, init_once);
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 469838998220..7332e236d367 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -137,7 +137,7 @@ void __init fork_init(unsigned long mempages)
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
 		kmem_cache_create("task_struct", sizeof(struct task_struct),
-			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
+			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
 #endif
 
 	/*
@@ -1446,22 +1446,22 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
-			sighand_ctor, NULL);
+			sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
-	files_cachep = kmem_cache_create("files_cache", 
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
-	fs_cachep = kmem_cache_create("fs_cache", 
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	vm_area_cachep = kmem_cache_create("vm_area_struct",
 			sizeof(struct vm_area_struct), 0,
-			SLAB_PANIC, NULL, NULL);
+			SLAB_PANIC, NULL);
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 }
 
 /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 10f0bbba382b..a4fb7d46971f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -193,7 +193,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 static int __init nsproxy_cache_init(void)
 {
 	nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
-					   0, SLAB_PANIC, NULL, NULL);
+					   0, SLAB_PANIC, NULL);
 	return 0;
 }
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 329ce0172074..55b3761edaa9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -241,7 +241,7 @@ static __init int init_posix_timers(void)
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
-					sizeof (struct k_itimer), 0, 0, NULL, NULL);
+					sizeof (struct k_itimer), 0, 0, NULL);
 	idr_init(&posix_timers_id);
 	return 0;
 }
diff --git a/kernel/user.c b/kernel/user.c
index 98b82507797a..e7d11cef6998 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -208,7 +208,7 @@ static int __init uid_cache_init(void)
 	int n;
 
 	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
-			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
 		INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
diff --git a/lib/idr.c b/lib/idr.c
index 5ca67b3cfd35..ffd61941e75d 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -590,7 +590,7 @@ static  int init_id_cache(void)
 {
 	if (!idr_layer_cache)
 		idr_layer_cache = kmem_cache_create("idr_layer_cache",
-			sizeof(struct idr_layer), 0, 0, idr_cache_ctor, NULL);
+			sizeof(struct idr_layer), 0, 0, idr_cache_ctor);
 	return 0;
 }
 
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9927cca14cb7..514efb200be6 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1021,7 +1021,7 @@ void __init radix_tree_init(void)
 {
 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
 			sizeof(struct radix_tree_node), 0,
-			SLAB_PANIC, radix_tree_node_ctor, NULL);
+			SLAB_PANIC, radix_tree_node_ctor);
 	radix_tree_init_maxindex();
 	hotcpu_notifier(radix_tree_callback, 0);
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9f4e9b95e8f2..71b84b45154a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1605,11 +1605,11 @@ void __init numa_policy_init(void)
 
 	policy_cache = kmem_cache_create("numa_policy",
 					 sizeof(struct mempolicy),
-					 0, SLAB_PANIC, NULL, NULL);
+					 0, SLAB_PANIC, NULL);
 
 	sn_cache = kmem_cache_create("shared_policy_node",
 				     sizeof(struct sp_node),
-				     0, SLAB_PANIC, NULL, NULL);
+				     0, SLAB_PANIC, NULL);
 
 	/*
 	 * Set interleaving policy for system init. Interleaving is only
diff --git a/mm/rmap.c b/mm/rmap.c
index fede5c7910be..41ac39749ef4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -149,7 +149,7 @@ static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
 void __init anon_vma_init(void)
 {
 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
-			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
 }
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index ad155c7745dc..fcd19d323f9f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2322,7 +2322,7 @@ static int init_inodecache(void)
 {
 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
 				sizeof(struct shmem_inode_info),
-				0, 0, init_once, NULL);
+				0, 0, init_once);
 	if (shmem_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/mm/slab.c b/mm/slab.c
index c3feeaab3875..bde271c001ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1484,7 +1484,7 @@ void __init kmem_cache_init(void)
 					sizes[INDEX_AC].cs_size,
 					ARCH_KMALLOC_MINALIGN,
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-					NULL, NULL);
+					NULL);
 
 	if (INDEX_AC != INDEX_L3) {
 		sizes[INDEX_L3].cs_cachep =
@@ -1492,7 +1492,7 @@ void __init kmem_cache_init(void)
 				sizes[INDEX_L3].cs_size,
 				ARCH_KMALLOC_MINALIGN,
 				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-				NULL, NULL);
+				NULL);
 	}
 
 	slab_early_init = 0;
@@ -1510,7 +1510,7 @@ void __init kmem_cache_init(void)
 					sizes->cs_size,
 					ARCH_KMALLOC_MINALIGN,
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-					NULL, NULL);
+					NULL);
 		}
 #ifdef CONFIG_ZONE_DMA
 		sizes->cs_dmacachep = kmem_cache_create(
@@ -1519,7 +1519,7 @@ void __init kmem_cache_init(void)
 					ARCH_KMALLOC_MINALIGN,
 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
 						SLAB_PANIC,
-					NULL, NULL);
+					NULL);
 #endif
 		sizes++;
 		names++;
@@ -2101,12 +2101,10 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
  * @align: The required alignment for the objects.
  * @flags: SLAB flags
  * @ctor: A constructor for the objects.
- * @dtor: A destructor for the objects (not implemented anymore).
  *
  * Returns a ptr to the cache on success, NULL on failure.
  * Cannot be called within a int, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache
- * and the @dtor is run before the pages are handed back.
+ * The @ctor is run when new pages are allocated by the cache.
  *
  * @name must be valid until the cache is destroyed. This implies that
  * the module calling this has to destroy the cache before getting unloaded.
@@ -2126,8 +2124,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
 	unsigned long flags,
-	void (*ctor)(void*, struct kmem_cache *, unsigned long),
-	void (*dtor)(void*, struct kmem_cache *, unsigned long))
+	void (*ctor)(void*, struct kmem_cache *, unsigned long))
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
@@ -2136,7 +2133,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * Sanity checks... these are all serious usage bugs.
 	 */
 	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
-	    size > KMALLOC_MAX_SIZE || dtor) {
+	    size > KMALLOC_MAX_SIZE) {
 		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
 				name);
 		BUG();
diff --git a/mm/slob.c b/mm/slob.c
index c89ef116d7aa..d50920ecc02b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -492,8 +492,7 @@ struct kmem_cache {
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 	size_t align, unsigned long flags,
-	void (*ctor)(void*, struct kmem_cache *, unsigned long),
-	void (*dtor)(void*, struct kmem_cache *, unsigned long))
+	void (*ctor)(void*, struct kmem_cache *, unsigned long))
 {
 	struct kmem_cache *c;
 
diff --git a/mm/slub.c b/mm/slub.c
index 322f3a5d72c7..9b2d6178d06c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2668,12 +2668,10 @@ static struct kmem_cache *find_mergeable(size_t size,
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 		size_t align, unsigned long flags,
-		void (*ctor)(void *, struct kmem_cache *, unsigned long),
-		void (*dtor)(void *, struct kmem_cache *, unsigned long))
+		void (*ctor)(void *, struct kmem_cache *, unsigned long))
 {
 	struct kmem_cache *s;
 
-	BUG_ON(dtor);
 	down_write(&slub_lock);
 	s = find_mergeable(size, align, flags, ctor);
 	if (s) {
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 3fc697293819..69b70977f000 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -36,7 +36,7 @@ int __init br_fdb_init(void)
 	br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
 					 sizeof(struct net_bridge_fdb_entry),
 					 0,
-					 SLAB_HWCACHE_ALIGN, NULL, NULL);
+					 SLAB_HWCACHE_ALIGN, NULL);
 	if (!br_fdb_cache)
 		return -ENOMEM;
 
diff --git a/net/core/flow.c b/net/core/flow.c
index 051430545a05..0ab5234b17d8 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -350,7 +350,7 @@ static int __init flow_cache_init(void)
 	flow_cachep = kmem_cache_create("flow_cache",
 					sizeof(struct flow_cache_entry),
 					0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					NULL, NULL);
+					NULL);
 	flow_hash_shift = 10;
 	flow_lwm = 2 * flow_hash_size;
 	flow_hwm = 4 * flow_hash_size;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 9df26a07f067..ca2a1533138a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1347,7 +1347,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
 		tbl->kmem_cachep =
 			kmem_cache_create(tbl->id, tbl->entry_size, 0,
 					  SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					  NULL, NULL);
+					  NULL);
 	tbl->stats = alloc_percpu(struct neigh_statistics);
 	if (!tbl->stats)
 		panic("cannot create neighbour cache statistics");
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0583e8498f13..35021eb3ed07 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2021,13 +2021,13 @@ void __init skb_init(void)
 					      sizeof(struct sk_buff),
 					      0,
 					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					      NULL, NULL);
+					      NULL);
 	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
 						(2*sizeof(struct sk_buff)) +
 						sizeof(atomic_t),
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-						NULL, NULL);
+						NULL);
 }
 
 /**
diff --git a/net/core/sock.c b/net/core/sock.c
index 239a08a6ff24..bd209c4477a9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1767,7 +1767,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-					       SLAB_HWCACHE_ALIGN, NULL, NULL);
+					       SLAB_HWCACHE_ALIGN, NULL);
 
 		if (prot->slab == NULL) {
 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
@@ -1785,7 +1785,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 			sprintf(request_sock_slab_name, mask, prot->name);
 			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
 								 prot->rsk_prot->obj_size, 0,
-								 SLAB_HWCACHE_ALIGN, NULL, NULL);
+								 SLAB_HWCACHE_ALIGN, NULL);
 
 			if (prot->rsk_prot->slab == NULL) {
 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
@@ -1807,7 +1807,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 				kmem_cache_create(timewait_sock_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
 						  0, SLAB_HWCACHE_ALIGN,
-						  NULL, NULL);
+						  NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
 				goto out_free_timewait_sock_slab_name;
 		}
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 01030f346177..7ac775f9a64b 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -481,14 +481,14 @@ int __init dccp_ackvec_init(void)
 {
 	dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
 					     sizeof(struct dccp_ackvec), 0,
-					     SLAB_HWCACHE_ALIGN, NULL, NULL);
+					     SLAB_HWCACHE_ALIGN, NULL);
 	if (dccp_ackvec_slab == NULL)
 		goto out_err;
 
 	dccp_ackvec_record_slab =
 			kmem_cache_create("dccp_ackvec_record",
 					  sizeof(struct dccp_ackvec_record),
-					  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+					  0, SLAB_HWCACHE_ALIGN, NULL);
 	if (dccp_ackvec_record_slab == NULL)
 		goto out_destroy_slab;
 
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index d8cf92f09e68..ccbf72c793b6 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -69,7 +69,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, const char *fmt,.
 	if (slab_name == NULL)
 		return NULL;
 	slab = kmem_cache_create(slab_name, sizeof(struct ccid) + obj_size, 0,
-				 SLAB_HWCACHE_ALIGN, NULL, NULL);
+				 SLAB_HWCACHE_ALIGN, NULL);
 	if (slab == NULL)
 		kfree(slab_name);
 	return slab;
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index dd0fc992b042..174d3f13d93f 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -282,7 +282,7 @@ static __init int dccp_li_init(void)
 {
 	dccp_li_cachep = kmem_cache_create("dccp_li_hist",
 					   sizeof(struct dccp_li_hist_entry),
-					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+					   0, SLAB_HWCACHE_ALIGN, NULL);
 	return dccp_li_cachep == NULL ? -ENOBUFS : 0;
 }
 
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 2e8ef42721e2..34c4f6047724 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -59,7 +59,7 @@ struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
 	hist->dccptxh_slab = kmem_cache_create(slab_name,
 					     sizeof(struct dccp_tx_hist_entry),
 					       0, SLAB_HWCACHE_ALIGN,
-					       NULL, NULL);
+					       NULL);
 	if (hist->dccptxh_slab == NULL)
 		goto out_free_slab_name;
 out:
@@ -148,7 +148,7 @@ struct dccp_rx_hist *dccp_rx_hist_new(const char *name)
 	hist->dccprxh_slab = kmem_cache_create(slab_name,
 					     sizeof(struct dccp_rx_hist_entry),
 					       0, SLAB_HWCACHE_ALIGN,
-					       NULL, NULL);
+					       NULL);
 	if (hist->dccprxh_slab == NULL)
 		goto out_free_slab_name;
 out:
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 6607b7b14f34..04b59ec4f512 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1003,7 +1003,7 @@ static int __init dccp_init(void)
 	dccp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("dccp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-				  SLAB_HWCACHE_ALIGN, NULL, NULL);
+				  SLAB_HWCACHE_ALIGN, NULL);
 	if (!dccp_hashinfo.bind_bucket_cachep)
 		goto out;
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 82622fb6f68f..f2a61ef2af9c 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1770,7 +1770,7 @@ void __init dn_route_init(void)
 
 	dn_dst_ops.kmem_cachep =
 		kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	init_timer(&dn_route_timer);
 	dn_route_timer.function = dn_dst_check_expire;
 	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index d6615c9361e9..fda0772fa215 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -881,7 +881,7 @@ void __init dn_fib_table_init(void)
 	dn_hash_kmem = kmem_cache_create("dn_fib_info_cache",
 					sizeof(struct dn_fib_info),
 					0, SLAB_HWCACHE_ALIGN,
-					NULL, NULL);
+					NULL);
 }
 
 void __exit dn_fib_table_cleanup(void)
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 07e843a47dde..9ad1d9ff9ce7 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -771,13 +771,13 @@ struct fib_table * __init fib_hash_init(u32 id)
 		fn_hash_kmem = kmem_cache_create("ip_fib_hash",
 						 sizeof(struct fib_node),
 						 0, SLAB_HWCACHE_ALIGN,
-						 NULL, NULL);
+						 NULL);
 
 	if (fn_alias_kmem == NULL)
 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
 						  sizeof(struct fib_alias),
 						  0, SLAB_HWCACHE_ALIGN,
-						  NULL, NULL);
+						  NULL);
 
 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
 		     GFP_KERNEL);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 30e332ade61b..9ca786a6fd3c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1970,7 +1970,7 @@ struct fib_table * __init fib_hash_init(u32 id)
 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
 						  sizeof(struct fib_alias),
 						  0, SLAB_HWCACHE_ALIGN,
-						  NULL, NULL);
+						  NULL);
 
 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
 		     GFP_KERNEL);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2f44e6128068..6cbce96a54ce 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -123,7 +123,7 @@ void __init inet_initpeers(void)
 	peer_cachep = kmem_cache_create("inet_peer_cache",
 			sizeof(struct inet_peer),
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-			NULL, NULL);
+			NULL);
 
 	/* All the timers, started at system startup tend
 	   to synchronize. Perturb it a bit.
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d96582acdf69..7003cc1b7fe2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1917,7 +1917,7 @@ void __init ip_mr_init(void)
 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
 				       sizeof(struct mfc_cache),
 				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-				       NULL, NULL);
+				       NULL);
 	init_timer(&ipmr_expire_timer);
 	ipmr_expire_timer.function=ipmr_expire_process;
 	register_netdevice_notifier(&ip_mr_notifier);
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 3b446b1a6b9c..d612a6a5d957 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -901,7 +901,7 @@ int ip_vs_conn_init(void)
 	/* Allocate ip_vs_conn slab cache */
 	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
 					      sizeof(struct ip_vs_conn), 0,
-					      SLAB_HWCACHE_ALIGN, NULL, NULL);
+					      SLAB_HWCACHE_ALIGN, NULL);
 	if (!ip_vs_conn_cachep) {
 		vfree(ip_vs_conn_tab);
 		return -ENOMEM;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 88fa648d7ba3..df42b7fb3268 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2967,7 +2967,7 @@ int __init ip_rt_init(void)
 
 	ipv4_dst_ops.kmem_cachep =
 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 987b94403be5..da4c0b6ab79a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2430,7 +2430,7 @@ void __init tcp_init(void)
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	/* Size and allocate the main established and bind bucket
 	 * hash tables.
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 662a7d9681fd..6a612a701eaa 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1474,7 +1474,7 @@ void __init fib6_init(void)
 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
 					   sizeof(struct fib6_node),
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					   NULL, NULL);
+					   NULL);
 
 	fib6_tables_init();
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fe8d9837f9f8..919de682b331 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2555,7 +2555,7 @@ void __init ip6_route_init(void)
 #endif
 	ip6_dst_ops.kmem_cachep =
 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
 
 	fib6_init();
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 6f87dd568ded..30f3236c402a 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -84,7 +84,7 @@ static int xfrm6_tunnel_spi_init(void)
 	xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi",
 						  sizeof(struct xfrm6_tunnel_spi),
 						  0, SLAB_HWCACHE_ALIGN,
-						  NULL, NULL);
+						  NULL);
 	if (!xfrm6_tunnel_spi_kmem)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8cce814f6bee..aa086c83af80 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1108,7 +1108,7 @@ int __init nf_conntrack_init(void)
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
 						sizeof(struct nf_conn),
-						0, 0, NULL, NULL);
+						0, 0, NULL);
 	if (!nf_conntrack_cachep) {
 		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
 		goto err_free_hash;
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 2191fe008f60..1aa6229ca99f 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -540,7 +540,7 @@ int __init nf_conntrack_expect_init(void)
 
 	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
 					sizeof(struct nf_conntrack_expect),
-					0, 0, NULL, NULL);
+					0, 0, NULL);
 	if (!nf_ct_expect_cachep)
 		goto err2;
 
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index d6b3d01975b6..bd45f9d3f7d0 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -738,7 +738,7 @@ static int __init xt_hashlimit_init(void)
 	err = -ENOMEM;
 	hashlimit_cachep = kmem_cache_create("xt_hashlimit",
 					    sizeof(struct dsthash_ent), 0, 0,
-					    NULL, NULL);
+					    NULL);
 	if (!hashlimit_cachep) {
 		printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n");
 		goto err2;
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 46f6d572ad2d..16a68df4e36b 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -792,7 +792,7 @@ static int __init af_rxrpc_init(void)
 	ret = -ENOMEM;
 	rxrpc_call_jar = kmem_cache_create(
 		"rxrpc_call_jar", sizeof(struct rxrpc_call), 0,
-		SLAB_HWCACHE_ALIGN, NULL, NULL);
+		SLAB_HWCACHE_ALIGN, NULL);
 	if (!rxrpc_call_jar) {
 		printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n");
 		goto error_call_jar;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 34bab36637ac..e98579b788b8 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -980,14 +980,14 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket",
 					       sizeof(struct sctp_bind_bucket),
 					       0, SLAB_HWCACHE_ALIGN,
-					       NULL, NULL);
+					       NULL);
 	if (!sctp_bucket_cachep)
 		goto out;
 
 	sctp_chunk_cachep = kmem_cache_create("sctp_chunk",
 					       sizeof(struct sctp_chunk),
 					       0, SLAB_HWCACHE_ALIGN,
-					       NULL, NULL);
+					       NULL);
 	if (!sctp_chunk_cachep)
 		goto err_chunk_cachep;
 
diff --git a/net/socket.c b/net/socket.c
index b71114250046..ec077037f534 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -272,8 +272,7 @@ static int init_inodecache(void)
 					      (SLAB_HWCACHE_ALIGN |
 					       SLAB_RECLAIM_ACCOUNT |
 					       SLAB_MEM_SPREAD),
-					      init_once,
-					      NULL);
+					      init_once);
 	if (sock_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 5b2b6fb244f2..650af064ff8d 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -867,7 +867,7 @@ int register_rpc_pipefs(void)
 				sizeof(struct rpc_inode),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-				init_once, NULL);
+				init_once);
 	if (!rpc_inode_cachep)
 		return -ENOMEM;
 	err = register_filesystem(&rpc_pipe_fs_type);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 2ac43c41c3a9..b5723c262a3e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1031,13 +1031,13 @@ rpc_init_mempool(void)
 	rpc_task_slabp = kmem_cache_create("rpc_tasks",
 					     sizeof(struct rpc_task),
 					     0, SLAB_HWCACHE_ALIGN,
-					     NULL, NULL);
+					     NULL);
 	if (!rpc_task_slabp)
 		goto err_nomem;
 	rpc_buffer_slabp = kmem_cache_create("rpc_buffers",
 					     RPC_BUFFER_MAXSIZE,
 					     0, SLAB_HWCACHE_ALIGN,
-					     NULL, NULL);
+					     NULL);
 	if (!rpc_buffer_slabp)
 		goto err_nomem;
 	rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE,
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
index e1dcf663f8a6..0c70010a7dfe 100644
--- a/net/tipc/handler.c
+++ b/net/tipc/handler.c
@@ -97,7 +97,7 @@ int tipc_handler_start(void)
 {
 	tipc_queue_item_cache =
 		kmem_cache_create("tipc_queue_items", sizeof(struct queue_item),
-				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+				  0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!tipc_queue_item_cache)
 		return -ENOMEM;
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 5c4695840c58..113f44429982 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -83,5 +83,5 @@ void __init xfrm_input_init(void)
 	secpath_cachep = kmem_cache_create("secpath_cache",
 					   sizeof(struct sec_path),
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					   NULL, NULL);
+					   NULL);
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index cfaf17c8851e..c3a4b0a18687 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2378,7 +2378,7 @@ static void __init xfrm_policy_init(void)
 	xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
 					   sizeof(struct xfrm_dst),
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					   NULL, NULL);
+					   NULL);
 
 	hmask = 8 - 1;
 	sz = (hmask+1) * sizeof(struct hlist_head);
diff --git a/security/keys/key.c b/security/keys/key.c
index 700400d801dc..01bbc6d9d19b 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -1001,7 +1001,7 @@ void __init key_init(void)
 {
 	/* allocate a slab in which we can store keys */
 	key_jar = kmem_cache_create("key_jar", sizeof(struct key),
-			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	/* add the special key types */
 	list_add_tail(&key_type_keyring.link, &key_types_list);
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 78c408fd2b02..ecd067384531 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -239,7 +239,7 @@ void __init avc_init(void)
 	atomic_set(&avc_cache.lru_hint, 0);
 
 	avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
-					     0, SLAB_PANIC, NULL, NULL);
+					     0, SLAB_PANIC, NULL);
 
 	audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n");
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 26356e67108e..0fac6829c63a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4913,7 +4913,7 @@ static __init int selinux_init(void)
 
 	sel_inode_cache = kmem_cache_create("selinux_inode_security",
 					    sizeof(struct inode_security_struct),
-					    0, SLAB_PANIC, NULL, NULL);
+					    0, SLAB_PANIC, NULL);
 	avc_init();
 
 	original_ops = secondary_ops = security_ops;
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index 3122908afdc1..85705eb289e0 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -445,7 +445,7 @@ void avtab_cache_init(void)
 {
 	avtab_node_cachep = kmem_cache_create("avtab_node",
 					      sizeof(struct avtab_node),
-					      0, SLAB_PANIC, NULL, NULL);
+					      0, SLAB_PANIC, NULL);
 }
 
 void avtab_cache_destroy(void)
-- 
cgit v1.3-8-gc7d7


From 76c1ce7870fd9b05431da1bbd47fdafcc029a25b Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 1 May 2007 16:19:07 +1000
Subject: Split out common parts of prom.h

This creates linux/of.h and includes asm/prom.h from it.

We also include linux/of.h from asm/prom.h while we transition.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/asm-powerpc/prom.h | 43 +++++---------------------------
 include/asm-sparc/prom.h   | 46 +++++-----------------------------
 include/asm-sparc64/prom.h | 46 +++++-----------------------------
 include/linux/of.h         | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+), 117 deletions(-)
 create mode 100644 include/linux/of.h

(limited to 'include/linux')

diff --git a/include/asm-powerpc/prom.h b/include/asm-powerpc/prom.h
index 1632baa17dc6..1134aeab2327 100644
--- a/include/asm-powerpc/prom.h
+++ b/include/asm-powerpc/prom.h
@@ -97,10 +97,6 @@ struct device_node {
 
 extern struct device_node *of_chosen;
 
-/* flag descriptions */
-#define OF_DYNAMIC	1 /* node and properties were allocated via kmalloc */
-#define OF_DETACHED	2 /* node has been detached from the device tree */
-
 static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
 {
 	return test_bit(flag, &n->_flags);
@@ -120,31 +116,7 @@ static inline void set_node_proc_entry(struct device_node *dn, struct proc_dir_e
 }
 
 
-/* New style node lookup */
-extern struct device_node *of_find_node_by_name(struct device_node *from,
-	const char *name);
-#define for_each_node_by_name(dn, name) \
-	for (dn = of_find_node_by_name(NULL, name); dn; \
-	     dn = of_find_node_by_name(dn, name))
-extern struct device_node *of_find_node_by_type(struct device_node *from,
-	const char *type);
-#define for_each_node_by_type(dn, type) \
-	for (dn = of_find_node_by_type(NULL, type); dn; \
-	     dn = of_find_node_by_type(dn, type))
-extern struct device_node *of_find_compatible_node(struct device_node *from,
-	const char *type, const char *compat);
-#define for_each_compatible_node(dn, type, compatible) \
-	for (dn = of_find_compatible_node(NULL, type, compatible); dn; \
-	     dn = of_find_compatible_node(dn, type, compatible))
-extern struct device_node *of_find_node_by_path(const char *path);
-extern struct device_node *of_find_node_by_phandle(phandle handle);
 extern struct device_node *of_find_all_nodes(struct device_node *prev);
-extern struct device_node *of_get_parent(const struct device_node *node);
-extern struct device_node *of_get_next_child(const struct device_node *node,
-					     struct device_node *prev);
-extern struct property *of_find_property(const struct device_node *np,
-					 const char *name,
-					 int *lenp);
 extern struct device_node *of_node_get(struct device_node *node);
 extern void of_node_put(struct device_node *node);
 
@@ -166,17 +138,9 @@ extern void of_detach_node(const struct device_node *);
 extern void finish_device_tree(void);
 extern void unflatten_device_tree(void);
 extern void early_init_devtree(void *);
-extern int of_device_is_compatible(const struct device_node *device,
-				const char *);
 #define device_is_compatible(d, c)	of_device_is_compatible((d), (c))
 extern int machine_is_compatible(const char *compat);
-extern const void *of_get_property(const struct device_node *node,
-				const char *name,
-				int *lenp);
-#define get_property(a, b, c)	of_get_property((a), (b), (c))
 extern void print_properties(struct device_node *node);
-extern int of_n_addr_cells(struct device_node* np);
-extern int of_n_size_cells(struct device_node* np);
 extern int prom_n_intr_cells(struct device_node* np);
 extern void prom_get_irq_senses(unsigned char *senses, int off, int max);
 extern int prom_add_property(struct device_node* np, struct property* prop);
@@ -230,7 +194,6 @@ static inline unsigned long of_read_ulong(const u32 *cell, int size)
 
 /* Translate an OF address block into a CPU physical address
  */
-#define OF_BAD_ADDR	((u64)-1)
 extern u64 of_translate_address(struct device_node *np, const u32 *addr);
 
 /* Extract an address from a device, returns the region size and
@@ -357,5 +320,11 @@ extern int of_irq_to_resource(struct device_node *dev, int index,
  */
 extern void __iomem *of_iomap(struct device_node *device, int index);
 
+/*
+ * NB:  This is here while we transition from using asm/prom.h
+ * to linux/of.h
+ */
+#include <linux/of.h>
+
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PROM_H */
diff --git a/include/asm-sparc/prom.h b/include/asm-sparc/prom.h
index 9ea105ebe2ff..d67af08abb43 100644
--- a/include/asm-sparc/prom.h
+++ b/include/asm-sparc/prom.h
@@ -2,7 +2,6 @@
 #define _SPARC_PROM_H
 #ifdef __KERNEL__
 
-
 /*
  * Definitions for talking to the Open Firmware PROM on
  * Power Macintosh computers.
@@ -17,7 +16,6 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-
 #include <linux/types.h>
 #include <linux/proc_fs.h>
 #include <asm/atomic.h>
@@ -55,53 +53,21 @@ struct device_node {
 	unsigned int unique_id;
 };
 
-/* flag descriptions */
-#define OF_DYNAMIC 1 /* node and properties were allocated via kmalloc */
-
 #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
 #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
 
-#define OF_BAD_ADDR	((u64)-1)
-
-static inline void set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
-{
-	dn->pde = de;
-}
-
-extern struct device_node *of_find_node_by_name(struct device_node *from,
-	const char *name);
-#define for_each_node_by_name(dn, name) \
-	for (dn = of_find_node_by_name(NULL, name); dn; \
-	     dn = of_find_node_by_name(dn, name))
-extern struct device_node *of_find_node_by_type(struct device_node *from,
-	const char *type);
-#define for_each_node_by_type(dn, type) \
-	for (dn = of_find_node_by_type(NULL, type); dn; \
-	     dn = of_find_node_by_type(dn, type))
-extern struct device_node *of_find_compatible_node(struct device_node *from,
-	const char *type, const char *compat);
-extern struct device_node *of_find_node_by_path(const char *path);
-extern struct device_node *of_find_node_by_phandle(phandle handle);
-extern struct device_node *of_get_parent(const struct device_node *node);
-extern struct device_node *of_get_next_child(const struct device_node *node,
-					     struct device_node *prev);
-extern struct property *of_find_property(const struct device_node *np,
-					 const char *name,
-					 int *lenp);
-extern int of_device_is_compatible(const struct device_node *device,
-				   const char *);
-extern const void *of_get_property(const struct device_node *node,
-				   const char *name,
-				   int *lenp);
-#define get_property(node,name,lenp) of_get_property(node,name,lenp)
 extern int of_set_property(struct device_node *node, const char *name, void *val, int len);
 extern int of_getintprop_default(struct device_node *np,
 				 const char *name,
 				 int def);
-extern int of_n_addr_cells(struct device_node *np);
-extern int of_n_size_cells(struct device_node *np);
 
 extern void prom_build_devicetree(void);
 
+/*
+ * NB:  This is here while we transition from using asm/prom.h
+ * to linux/of.h
+ */
+#include <linux/of.h>
+
 #endif /* __KERNEL__ */
 #endif /* _SPARC_PROM_H */
diff --git a/include/asm-sparc64/prom.h b/include/asm-sparc64/prom.h
index b4df3042add0..9905ed057d93 100644
--- a/include/asm-sparc64/prom.h
+++ b/include/asm-sparc64/prom.h
@@ -2,7 +2,6 @@
 #define _SPARC64_PROM_H
 #ifdef __KERNEL__
 
-
 /*
  * Definitions for talking to the Open Firmware PROM on
  * Power Macintosh computers.
@@ -17,7 +16,6 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-
 #include <linux/types.h>
 #include <linux/proc_fs.h>
 #include <asm/atomic.h>
@@ -63,54 +61,22 @@ struct of_irq_controller {
 	void		*data;
 };
 
-/* flag descriptions */
-#define OF_DYNAMIC 1 /* node and properties were allocated via kmalloc */
-
 #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
 #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
 
-#define OF_BAD_ADDR	((u64)-1)
-
-static inline void set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
-{
-	dn->pde = de;
-}
-
-extern struct device_node *of_find_node_by_name(struct device_node *from,
-	const char *name);
-#define for_each_node_by_name(dn, name) \
-	for (dn = of_find_node_by_name(NULL, name); dn; \
-	     dn = of_find_node_by_name(dn, name))
-extern struct device_node *of_find_node_by_type(struct device_node *from,
-	const char *type);
-#define for_each_node_by_type(dn, type) \
-	for (dn = of_find_node_by_type(NULL, type); dn; \
-	     dn = of_find_node_by_type(dn, type))
-extern struct device_node *of_find_compatible_node(struct device_node *from,
-	const char *type, const char *compat);
-extern struct device_node *of_find_node_by_path(const char *path);
-extern struct device_node *of_find_node_by_phandle(phandle handle);
 extern struct device_node *of_find_node_by_cpuid(int cpuid);
-extern struct device_node *of_get_parent(const struct device_node *node);
-extern struct device_node *of_get_next_child(const struct device_node *node,
-					     struct device_node *prev);
-extern struct property *of_find_property(const struct device_node *np,
-					 const char *name,
-					 int *lenp);
-extern int of_device_is_compatible(const struct device_node *device,
-				   const char *);
-extern const void *of_get_property(const struct device_node *node,
-			     const char *name,
-			     int *lenp);
-#define get_property(node,name,lenp) of_get_property(node,name,lenp)
 extern int of_set_property(struct device_node *node, const char *name, void *val, int len);
 extern int of_getintprop_default(struct device_node *np,
 				 const char *name,
 				 int def);
-extern int of_n_addr_cells(struct device_node *np);
-extern int of_n_size_cells(struct device_node *np);
 
 extern void prom_build_devicetree(void);
 
+/*
+ * NB:  This is here while we transition from using asm/prom.h
+ * to linux/of.h
+ */
+#include <linux/of.h>
+
 #endif /* __KERNEL__ */
 #endif /* _SPARC64_PROM_H */
diff --git a/include/linux/of.h b/include/linux/of.h
new file mode 100644
index 000000000000..47734ffd9745
--- /dev/null
+++ b/include/linux/of.h
@@ -0,0 +1,61 @@
+#ifndef _LINUX_OF_H
+#define _LINUX_OF_H
+/*
+ * Definitions for talking to the Open Firmware PROM on
+ * Power Macintosh and other computers.
+ *
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * Updates for PPC64 by Peter Bergner & David Engebretsen, IBM Corp.
+ * Updates for SPARC64 by David S. Miller
+ * Derived from PowerPC and Sparc prom.h files by Stephen Rothwell, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+
+#include <asm/bitops.h>
+#include <asm/prom.h>
+
+/* flag descriptions */
+#define OF_DYNAMIC	1 /* node and properties were allocated via kmalloc */
+#define OF_DETACHED	2 /* node has been detached from the device tree */
+
+#define OF_BAD_ADDR	((u64)-1)
+
+extern struct device_node *of_find_node_by_name(struct device_node *from,
+	const char *name);
+#define for_each_node_by_name(dn, name) \
+	for (dn = of_find_node_by_name(NULL, name); dn; \
+	     dn = of_find_node_by_name(dn, name))
+extern struct device_node *of_find_node_by_type(struct device_node *from,
+	const char *type);
+#define for_each_node_by_type(dn, type) \
+	for (dn = of_find_node_by_type(NULL, type); dn; \
+	     dn = of_find_node_by_type(dn, type))
+extern struct device_node *of_find_compatible_node(struct device_node *from,
+	const char *type, const char *compat);
+#define for_each_compatible_node(dn, type, compatible) \
+	for (dn = of_find_compatible_node(NULL, type, compatible); dn; \
+	     dn = of_find_compatible_node(dn, type, compatible))
+extern struct device_node *of_find_node_by_path(const char *path);
+extern struct device_node *of_find_node_by_phandle(phandle handle);
+extern struct device_node *of_get_parent(const struct device_node *node);
+extern struct device_node *of_get_next_child(const struct device_node *node,
+					     struct device_node *prev);
+extern struct property *of_find_property(const struct device_node *np,
+					 const char *name,
+					 int *lenp);
+extern int of_device_is_compatible(const struct device_node *device,
+				   const char *);
+extern const void *of_get_property(const struct device_node *node,
+				const char *name,
+				int *lenp);
+#define get_property(a, b, c)	of_get_property((a), (b), (c))
+extern int of_n_addr_cells(struct device_node *np);
+extern int of_n_size_cells(struct device_node *np);
+
+#endif /* _LINUX_OF_H */
-- 
cgit v1.3-8-gc7d7


From f898f8dbcec4848cddb8c5be2d0affd75779ebe2 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 1 May 2007 16:49:51 +1000
Subject: Begin consolidation of of_device.h

This just moves the common stuff from the arch of_device.h files to
linux/of_device.h.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
---
 drivers/of/device.c             |  2 +-
 include/asm-powerpc/of_device.h | 22 +++++-----------------
 include/asm-sparc/of_device.h   | 15 ++++-----------
 include/asm-sparc64/of_device.h | 15 ++++-----------
 include/linux/of_device.h       | 26 ++++++++++++++++++++++++++
 5 files changed, 40 insertions(+), 40 deletions(-)
 create mode 100644 include/linux/of_device.h

(limited to 'include/linux')

diff --git a/drivers/of/device.c b/drivers/of/device.c
index 7f233d77d62f..6245f060fb77 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -1,13 +1,13 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/of.h>
+#include <linux/of_device.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
 #include <linux/slab.h>
 
 #include <asm/errno.h>
-#include <asm/of_device.h>
 
 /**
  * of_match_node - Tell if an device_node has a matching of_match structure
diff --git a/include/asm-powerpc/of_device.h b/include/asm-powerpc/of_device.h
index e9af49eb1aa8..ec2a8a2c737c 100644
--- a/include/asm-powerpc/of_device.h
+++ b/include/asm-powerpc/of_device.h
@@ -3,14 +3,12 @@
 #ifdef __KERNEL__
 
 #include <linux/device.h>
-#include <linux/mod_devicetable.h>
-#include <asm/prom.h>
-
+#include <linux/of.h>
 
 /*
  * The of_device is a kind of "base class" that is a superset of
  * struct device for use by devices attached to an OF node and
- * probed using OF properties
+ * probed using OF properties.
  */
 struct of_device
 {
@@ -18,24 +16,14 @@ struct of_device
 	u64			dma_mask;	/* DMA mask */
 	struct device		dev;		/* Generic device interface */
 };
-#define	to_of_device(d) container_of(d, struct of_device, dev)
-
-extern const struct of_device_id *of_match_node(
-	const struct of_device_id *matches, const struct device_node *node);
-extern const struct of_device_id *of_match_device(
-	const struct of_device_id *matches, const struct of_device *dev);
-
-extern struct of_device *of_dev_get(struct of_device *dev);
-extern void of_dev_put(struct of_device *dev);
-
-extern int of_device_register(struct of_device *ofdev);
-extern void of_device_unregister(struct of_device *ofdev);
-extern void of_release_dev(struct device *dev);
 
 extern ssize_t of_device_get_modalias(struct of_device *ofdev,
 					char *str, ssize_t len);
 extern int of_device_uevent(struct device *dev,
 	char **envp, int num_envp, char *buffer, int buffer_size);
 
+/* This is just here during the transition */
+#include <linux/of_device.h>
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_OF_DEVICE_H */
diff --git a/include/asm-sparc/of_device.h b/include/asm-sparc/of_device.h
index 7cb00c1b09c6..b625261c9ec7 100644
--- a/include/asm-sparc/of_device.h
+++ b/include/asm-sparc/of_device.h
@@ -3,9 +3,9 @@
 #ifdef __KERNEL__
 
 #include <linux/device.h>
+#include <linux/of.h>
 #include <linux/mod_devicetable.h>
 #include <asm/openprom.h>
-#include <asm/prom.h>
 
 extern struct bus_type ebus_bus_type;
 extern struct bus_type sbus_bus_type;
@@ -30,19 +30,12 @@ struct of_device
 	int				portid;
 	int				clock_freq;
 };
-#define	to_of_device(d) container_of(d, struct of_device, dev)
 
 extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name);
 extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size);
 
 extern struct of_device *of_find_device_by_node(struct device_node *);
 
-extern const struct of_device_id *of_match_device(
-	const struct of_device_id *matches, const struct of_device *dev);
-
-extern struct of_device *of_dev_get(struct of_device *dev);
-extern void of_dev_put(struct of_device *dev);
-
 /*
  * An of_platform_driver driver is attached to a basic of_device on
  * the ISA, EBUS, and SBUS busses on sparc64.
@@ -67,13 +60,13 @@ struct of_platform_driver
 extern int of_register_driver(struct of_platform_driver *drv,
 			      struct bus_type *bus);
 extern void of_unregister_driver(struct of_platform_driver *drv);
-extern int of_device_register(struct of_device *ofdev);
-extern void of_device_unregister(struct of_device *ofdev);
 extern struct of_device *of_platform_device_create(struct device_node *np,
 						   const char *bus_id,
 						   struct device *parent,
 						   struct bus_type *bus);
-extern void of_release_dev(struct device *dev);
+
+/* This is just here during the transition */
+#include <linux/of_device.h>
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_SPARC_OF_DEVICE_H */
diff --git a/include/asm-sparc64/of_device.h b/include/asm-sparc64/of_device.h
index 60e9173c9acb..68048cb20688 100644
--- a/include/asm-sparc64/of_device.h
+++ b/include/asm-sparc64/of_device.h
@@ -3,9 +3,9 @@
 #ifdef __KERNEL__
 
 #include <linux/device.h>
+#include <linux/of.h>
 #include <linux/mod_devicetable.h>
 #include <asm/openprom.h>
-#include <asm/prom.h>
 
 extern struct bus_type isa_bus_type;
 extern struct bus_type ebus_bus_type;
@@ -31,19 +31,12 @@ struct of_device
 	int				portid;
 	int				clock_freq;
 };
-#define	to_of_device(d) container_of(d, struct of_device, dev)
 
 extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name);
 extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size);
 
 extern struct of_device *of_find_device_by_node(struct device_node *);
 
-extern const struct of_device_id *of_match_device(
-	const struct of_device_id *matches, const struct of_device *dev);
-
-extern struct of_device *of_dev_get(struct of_device *dev);
-extern void of_dev_put(struct of_device *dev);
-
 /*
  * An of_platform_driver driver is attached to a basic of_device on
  * the ISA, EBUS, and SBUS busses on sparc64.
@@ -68,13 +61,13 @@ struct of_platform_driver
 extern int of_register_driver(struct of_platform_driver *drv,
 			      struct bus_type *bus);
 extern void of_unregister_driver(struct of_platform_driver *drv);
-extern int of_device_register(struct of_device *ofdev);
-extern void of_device_unregister(struct of_device *ofdev);
 extern struct of_device *of_platform_device_create(struct device_node *np,
 						   const char *bus_id,
 						   struct device *parent,
 						   struct bus_type *bus);
-extern void of_release_dev(struct device *dev);
+
+/* This is just here during the transition */
+#include <linux/of_device.h>
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_SPARC64_OF_DEVICE_H */
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
new file mode 100644
index 000000000000..91bf84b9d144
--- /dev/null
+++ b/include/linux/of_device.h
@@ -0,0 +1,26 @@
+#ifndef _LINUX_OF_DEVICE_H
+#define _LINUX_OF_DEVICE_H
+#ifdef __KERNEL__
+
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/mod_devicetable.h>
+
+#include <asm/of_device.h>
+
+#define	to_of_device(d) container_of(d, struct of_device, dev)
+
+extern const struct of_device_id *of_match_node(
+	const struct of_device_id *matches, const struct device_node *node);
+extern const struct of_device_id *of_match_device(
+	const struct of_device_id *matches, const struct of_device *dev);
+
+extern struct of_device *of_dev_get(struct of_device *dev);
+extern void of_dev_put(struct of_device *dev);
+
+extern int of_device_register(struct of_device *ofdev);
+extern void of_device_unregister(struct of_device *ofdev);
+extern void of_release_dev(struct device *dev);
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_OF_DEVICE_H */
-- 
cgit v1.3-8-gc7d7


From b41912ca345e6de8ec8469d57cd585881271e2b9 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 1 May 2007 16:12:57 +1000
Subject: Create linux/of_platorm.h

Move common stuff from asm-powerpc/of_platform.h to here and
move the common bits from asm-sparc*/of_device.h here as well.

Create asm-sparc*/of_platform.h and move appropriate parts of
of_device.h to them.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
---
 include/asm-powerpc/of_platform.h | 38 +++++----------------------
 include/asm-sparc/of_device.h     | 39 ++-------------------------
 include/asm-sparc/of_platform.h   | 32 +++++++++++++++++++++++
 include/asm-sparc64/of_device.h   | 40 ++--------------------------
 include/asm-sparc64/of_platform.h | 33 +++++++++++++++++++++++
 include/linux/of_platform.h       | 55 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 130 insertions(+), 107 deletions(-)
 create mode 100644 include/asm-sparc/of_platform.h
 create mode 100644 include/asm-sparc64/of_platform.h
 create mode 100644 include/linux/of_platform.h

(limited to 'include/linux')

diff --git a/include/asm-powerpc/of_platform.h b/include/asm-powerpc/of_platform.h
index 217eafb167e9..80e6fad28b4f 100644
--- a/include/asm-powerpc/of_platform.h
+++ b/include/asm-powerpc/of_platform.h
@@ -1,3 +1,5 @@
+#ifndef _ASM_POWERPC_OF_PLATFORM_H
+#define _ASM_POWERPC_OF_PLATFORM_H
 /*
  *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
  *			 <benh@kernel.crashing.org>
@@ -9,37 +11,8 @@
  *
  */
 
-#include <asm/of_device.h>
-
-/*
- * The of_platform_bus_type is a bus type used by drivers that do not
- * attach to a macio or similar bus but still use OF probing
- * mechanism
- */
-extern struct bus_type of_platform_bus_type;
-
-/*
- * An of_platform_driver driver is attached to a basic of_device on
- * the "platform bus" (of_platform_bus_type)
- */
-struct of_platform_driver
-{
-	char			*name;
-	struct of_device_id	*match_table;
-	struct module		*owner;
-
-	int	(*probe)(struct of_device* dev,
-			 const struct of_device_id *match);
-	int	(*remove)(struct of_device* dev);
-
-	int	(*suspend)(struct of_device* dev, pm_message_t state);
-	int	(*resume)(struct of_device* dev);
-	int	(*shutdown)(struct of_device* dev);
-
-	struct device_driver	driver;
-};
-#define	to_of_platform_driver(drv) \
-	container_of(drv,struct of_platform_driver, driver)
+/* This is just here during the transition */
+#include <linux/of_platform.h>
 
 /* Platform drivers register/unregister */
 extern int of_register_platform_driver(struct of_platform_driver *drv);
@@ -56,5 +29,6 @@ extern int of_platform_bus_probe(struct device_node *root,
 				 struct of_device_id *matches,
 				 struct device *parent);
 
-extern struct of_device *of_find_device_by_node(struct device_node *np);
 extern struct of_device *of_find_device_by_phandle(phandle ph);
+
+#endif	/* _ASM_POWERPC_OF_PLATFORM_H */
diff --git a/include/asm-sparc/of_device.h b/include/asm-sparc/of_device.h
index fd8e95743139..e5f5aedc2293 100644
--- a/include/asm-sparc/of_device.h
+++ b/include/asm-sparc/of_device.h
@@ -7,11 +7,6 @@
 #include <linux/mod_devicetable.h>
 #include <asm/openprom.h>
 
-extern struct bus_type ebus_bus_type;
-extern struct bus_type sbus_bus_type;
-extern struct bus_type of_platform_bus_type;
-#define of_bus_type	of_platform_bus_type	/* for compatibility */
-
 /*
  * The of_device is a kind of "base class" that is a superset of
  * struct device for use by devices attached to an OF node and
@@ -35,39 +30,9 @@ struct of_device
 extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name);
 extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size);
 
-extern struct of_device *of_find_device_by_node(struct device_node *);
-
-/*
- * An of_platform_driver driver is attached to a basic of_device on
- * the ISA, EBUS, and SBUS busses on sparc64.
- */
-struct of_platform_driver
-{
-	char			*name;
-	struct of_device_id	*match_table;
-	struct module		*owner;
-
-	int	(*probe)(struct of_device* dev, const struct of_device_id *match);
-	int	(*remove)(struct of_device* dev);
-
-	int	(*suspend)(struct of_device* dev, pm_message_t state);
-	int	(*resume)(struct of_device* dev);
-	int	(*shutdown)(struct of_device* dev);
-
-	struct device_driver	driver;
-};
-#define	to_of_platform_driver(drv) container_of(drv,struct of_platform_driver, driver)
-
-extern int of_register_driver(struct of_platform_driver *drv,
-			      struct bus_type *bus);
-extern void of_unregister_driver(struct of_platform_driver *drv);
-extern struct of_device *of_platform_device_create(struct device_node *np,
-						   const char *bus_id,
-						   struct device *parent,
-						   struct bus_type *bus);
-
-/* This is just here during the transition */
+/* These are just here during the transition */
 #include <linux/of_device.h>
+#include <linux/of_platform.h>
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_SPARC_OF_DEVICE_H */
diff --git a/include/asm-sparc/of_platform.h b/include/asm-sparc/of_platform.h
new file mode 100644
index 000000000000..64a230064ef2
--- /dev/null
+++ b/include/asm-sparc/of_platform.h
@@ -0,0 +1,32 @@
+#ifndef _ASM_SPARC_OF_PLATFORM_H
+#define _ASM_SPARC_OF_PLATFORM_H
+/*
+ *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
+ *			 <benh@kernel.crashing.org>
+ *    Modified for Sparc by merging parts of asm-sparc/of_device.h
+ *		by Stephen Rothwell
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* This is just here during the transition */
+#include <linux/of_platform.h>
+
+extern struct bus_type ebus_bus_type;
+extern struct bus_type sbus_bus_type;
+extern struct bus_type of_platform_bus_type;
+#define of_bus_type	of_platform_bus_type	/* for compatibility */
+
+extern int of_register_driver(struct of_platform_driver *drv,
+			      struct bus_type *bus);
+extern void of_unregister_driver(struct of_platform_driver *drv);
+extern struct of_device *of_platform_device_create(struct device_node *np,
+						   const char *bus_id,
+						   struct device *parent,
+						   struct bus_type *bus);
+
+#endif	/* _ASM_SPARC_OF_PLATFORM_H */
diff --git a/include/asm-sparc64/of_device.h b/include/asm-sparc64/of_device.h
index daf36eb52292..46d69b3223c5 100644
--- a/include/asm-sparc64/of_device.h
+++ b/include/asm-sparc64/of_device.h
@@ -7,12 +7,6 @@
 #include <linux/mod_devicetable.h>
 #include <asm/openprom.h>
 
-extern struct bus_type isa_bus_type;
-extern struct bus_type ebus_bus_type;
-extern struct bus_type sbus_bus_type;
-extern struct bus_type of_platform_bus_type;
-#define of_bus_type	of_platform_bus_type	/* for compatibility */
-
 /*
  * The of_device is a kind of "base class" that is a superset of
  * struct device for use by devices attached to an OF node and
@@ -36,39 +30,9 @@ struct of_device
 extern void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name);
 extern void of_iounmap(struct resource *res, void __iomem *base, unsigned long size);
 
-extern struct of_device *of_find_device_by_node(struct device_node *);
-
-/*
- * An of_platform_driver driver is attached to a basic of_device on
- * the ISA, EBUS, and SBUS busses on sparc64.
- */
-struct of_platform_driver
-{
-	char			*name;
-	struct of_device_id	*match_table;
-	struct module		*owner;
-
-	int	(*probe)(struct of_device* dev, const struct of_device_id *match);
-	int	(*remove)(struct of_device* dev);
-
-	int	(*suspend)(struct of_device* dev, pm_message_t state);
-	int	(*resume)(struct of_device* dev);
-	int	(*shutdown)(struct of_device* dev);
-
-	struct device_driver	driver;
-};
-#define	to_of_platform_driver(drv) container_of(drv,struct of_platform_driver, driver)
-
-extern int of_register_driver(struct of_platform_driver *drv,
-			      struct bus_type *bus);
-extern void of_unregister_driver(struct of_platform_driver *drv);
-extern struct of_device *of_platform_device_create(struct device_node *np,
-						   const char *bus_id,
-						   struct device *parent,
-						   struct bus_type *bus);
-
-/* This is just here during the transition */
+/* These are just here during the transition */
 #include <linux/of_device.h>
+#include <linux/of_platform.h>
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_SPARC64_OF_DEVICE_H */
diff --git a/include/asm-sparc64/of_platform.h b/include/asm-sparc64/of_platform.h
new file mode 100644
index 000000000000..f7c1f17c7d52
--- /dev/null
+++ b/include/asm-sparc64/of_platform.h
@@ -0,0 +1,33 @@
+#ifndef _ASM_SPARC64_OF_PLATFORM_H
+#define _ASM_SPARC64_OF_PLATFORM_H
+/*
+ *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
+ *			 <benh@kernel.crashing.org>
+ *    Modified for Sparc by merging parts of asm-sparc/of_device.h
+ *		by Stephen Rothwell
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* This is just here during the transition */
+#include <linux/of_platform.h>
+
+extern struct bus_type isa_bus_type;
+extern struct bus_type ebus_bus_type;
+extern struct bus_type sbus_bus_type;
+extern struct bus_type of_platform_bus_type;
+#define of_bus_type	of_platform_bus_type	/* for compatibility */
+
+extern int of_register_driver(struct of_platform_driver *drv,
+			      struct bus_type *bus);
+extern void of_unregister_driver(struct of_platform_driver *drv);
+extern struct of_device *of_platform_device_create(struct device_node *np,
+						   const char *bus_id,
+						   struct device *parent,
+						   struct bus_type *bus);
+
+#endif	/* _ASM_SPARC64_OF_PLATFORM_H */
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
new file mode 100644
index 000000000000..c85d0f835783
--- /dev/null
+++ b/include/linux/of_platform.h
@@ -0,0 +1,55 @@
+#ifndef _LINUX_OF_PLATFORM_H
+#define _LINUX_OF_PLATFORM_H
+/*
+ *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
+ *			 <benh@kernel.crashing.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/pm.h>
+#include <linux/of_device.h>
+
+/*
+ * The of_platform_bus_type is a bus type used by drivers that do not
+ * attach to a macio or similar bus but still use OF probing
+ * mechanism
+ */
+extern struct bus_type of_platform_bus_type;
+
+/*
+ * An of_platform_driver driver is attached to a basic of_device on
+ * the "platform bus" (of_platform_bus_type) (or ISA, EBUS and SBUS
+ * busses on sparc).
+ */
+struct of_platform_driver
+{
+	char			*name;
+	struct of_device_id	*match_table;
+	struct module		*owner;
+
+	int	(*probe)(struct of_device* dev,
+			 const struct of_device_id *match);
+	int	(*remove)(struct of_device* dev);
+
+	int	(*suspend)(struct of_device* dev, pm_message_t state);
+	int	(*resume)(struct of_device* dev);
+	int	(*shutdown)(struct of_device* dev);
+
+	struct device_driver	driver;
+};
+#define	to_of_platform_driver(drv) \
+	container_of(drv,struct of_platform_driver, driver)
+
+#include <asm/of_platform.h>
+
+extern struct of_device *of_find_device_by_node(struct device_node *np);
+
+#endif	/* _LINUX_OF_PLATFORM_H */
-- 
cgit v1.3-8-gc7d7


From 3f23de10f283819bcdc0d2282e8b5b14c2e96d3b Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 3 May 2007 02:38:57 +1000
Subject: Create drivers/of/platform.c

and populate it with the common parts from PowerPC and Sparc[64].

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/powerpc/kernel/of_platform.c |  82 +-------------------------
 arch/sparc/kernel/of_device.c     | 107 +++-------------------------------
 arch/sparc64/kernel/of_device.c   | 118 ++++----------------------------------
 drivers/of/Makefile               |   2 +-
 drivers/of/platform.c             |  96 +++++++++++++++++++++++++++++++
 include/linux/of_platform.h       |   2 +
 6 files changed, 120 insertions(+), 287 deletions(-)
 create mode 100644 drivers/of/platform.c

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index 8ded4e7dc87e..f70e787d556f 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -55,94 +55,14 @@ static struct of_device_id of_default_bus_ids[] = {
 
 static atomic_t bus_no_reg_magic;
 
-/*
- *
- * OF platform device type definition & base infrastructure
- *
- */
-
-static int of_platform_bus_match(struct device *dev, struct device_driver *drv)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * of_drv = to_of_platform_driver(drv);
-	const struct of_device_id * matches = of_drv->match_table;
-
-	if (!matches)
-		return 0;
-
-	return of_match_device(matches, of_dev) != NULL;
-}
-
-static int of_platform_device_probe(struct device *dev)
-{
-	int error = -ENODEV;
-	struct of_platform_driver *drv;
-	struct of_device *of_dev;
-	const struct of_device_id *match;
-
-	drv = to_of_platform_driver(dev->driver);
-	of_dev = to_of_device(dev);
-
-	if (!drv->probe)
-		return error;
-
-	of_dev_get(of_dev);
-
-	match = of_match_device(drv->match_table, of_dev);
-	if (match)
-		error = drv->probe(of_dev, match);
-	if (error)
-		of_dev_put(of_dev);
-
-	return error;
-}
-
-static int of_platform_device_remove(struct device *dev)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-
-	if (dev->driver && drv->remove)
-		drv->remove(of_dev);
-	return 0;
-}
-
-static int of_platform_device_suspend(struct device *dev, pm_message_t state)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-	int error = 0;
-
-	if (dev->driver && drv->suspend)
-		error = drv->suspend(of_dev, state);
-	return error;
-}
-
-static int of_platform_device_resume(struct device * dev)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-	int error = 0;
-
-	if (dev->driver && drv->resume)
-		error = drv->resume(of_dev);
-	return error;
-}
-
 struct bus_type of_platform_bus_type = {
-       .name	= "of_platform",
-       .match	= of_platform_bus_match,
        .uevent	= of_device_uevent,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
 };
 EXPORT_SYMBOL(of_platform_bus_type);
 
 static int __init of_bus_driver_init(void)
 {
-	return bus_register(&of_platform_bus_type);
+	return of_bus_type_init(&of_platform_bus_type, "of_platform");
 }
 
 postcore_initcall(of_bus_driver_init);
diff --git a/arch/sparc/kernel/of_device.c b/arch/sparc/kernel/of_device.c
index cb21983cff9f..7176040caba0 100644
--- a/arch/sparc/kernel/of_device.c
+++ b/arch/sparc/kernel/of_device.c
@@ -5,77 +5,9 @@
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
 #include <linux/slab.h>
-
-#include <asm/errno.h>
-#include <asm/of_device.h>
-
-static int of_platform_bus_match(struct device *dev, struct device_driver *drv)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * of_drv = to_of_platform_driver(drv);
-	const struct of_device_id * matches = of_drv->match_table;
-
-	if (!matches)
-		return 0;
-
-	return of_match_device(matches, of_dev) != NULL;
-}
-
-static int of_platform_device_probe(struct device *dev)
-{
-	int error = -ENODEV;
-	struct of_platform_driver *drv;
-	struct of_device *of_dev;
-	const struct of_device_id *match;
-
-	drv = to_of_platform_driver(dev->driver);
-	of_dev = to_of_device(dev);
-
-	if (!drv->probe)
-		return error;
-
-	of_dev_get(of_dev);
-
-	match = of_match_device(drv->match_table, of_dev);
-	if (match)
-		error = drv->probe(of_dev, match);
-	if (error)
-		of_dev_put(of_dev);
-
-	return error;
-}
-
-static int of_platform_device_remove(struct device *dev)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-
-	if (dev->driver && drv->remove)
-		drv->remove(of_dev);
-	return 0;
-}
-
-static int of_platform_device_suspend(struct device *dev, pm_message_t state)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-	int error = 0;
-
-	if (dev->driver && drv->suspend)
-		error = drv->suspend(of_dev, state);
-	return error;
-}
-
-static int of_platform_device_resume(struct device * dev)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-	int error = 0;
-
-	if (dev->driver && drv->resume)
-		error = drv->resume(of_dev);
-	return error;
-}
+#include <linux/errno.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
 
 static int node_match(struct device *dev, void *data)
 {
@@ -98,37 +30,16 @@ struct of_device *of_find_device_by_node(struct device_node *dp)
 EXPORT_SYMBOL(of_find_device_by_node);
 
 #ifdef CONFIG_PCI
-struct bus_type ebus_bus_type = {
-       .name	= "ebus",
-       .match	= of_platform_bus_match,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
-};
+struct bus_type ebus_bus_type;
 EXPORT_SYMBOL(ebus_bus_type);
 #endif
 
 #ifdef CONFIG_SBUS
-struct bus_type sbus_bus_type = {
-       .name	= "sbus",
-       .match	= of_platform_bus_match,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
-};
+struct bus_type sbus_bus_type;
 EXPORT_SYMBOL(sbus_bus_type);
 #endif
 
-struct bus_type of_platform_bus_type = {
-       .name	= "of",
-       .match	= of_platform_bus_match,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
-};
+struct bus_type of_platform_bus_type;
 EXPORT_SYMBOL(of_platform_bus_type);
 
 static inline u64 of_read_addr(const u32 *cell, int size)
@@ -639,14 +550,14 @@ static int __init of_bus_driver_init(void)
 {
 	int err;
 
-	err = bus_register(&of_platform_bus_type);
+	err = of_bus_type_init(&of_platform_bus_type, "of");
 #ifdef CONFIG_PCI
 	if (!err)
-		err = bus_register(&ebus_bus_type);
+		err = of_bus_type_init(&ebus_bus_type, "ebus");
 #endif
 #ifdef CONFIG_SBUS
 	if (!err)
-		err = bus_register(&sbus_bus_type);
+		err = of_bus_type_init(&sbus_bus_type, "sbus");
 #endif
 
 	if (!err)
diff --git a/arch/sparc64/kernel/of_device.c b/arch/sparc64/kernel/of_device.c
index 485f8579899a..7b0dce9604ee 100644
--- a/arch/sparc64/kernel/of_device.c
+++ b/arch/sparc64/kernel/of_device.c
@@ -5,77 +5,9 @@
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
 #include <linux/slab.h>
-
-#include <asm/errno.h>
-#include <asm/of_device.h>
-
-static int of_platform_bus_match(struct device *dev, struct device_driver *drv)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * of_drv = to_of_platform_driver(drv);
-	const struct of_device_id * matches = of_drv->match_table;
-
-	if (!matches)
-		return 0;
-
-	return of_match_device(matches, of_dev) != NULL;
-}
-
-static int of_platform_device_probe(struct device *dev)
-{
-	int error = -ENODEV;
-	struct of_platform_driver *drv;
-	struct of_device *of_dev;
-	const struct of_device_id *match;
-
-	drv = to_of_platform_driver(dev->driver);
-	of_dev = to_of_device(dev);
-
-	if (!drv->probe)
-		return error;
-
-	of_dev_get(of_dev);
-
-	match = of_match_device(drv->match_table, of_dev);
-	if (match)
-		error = drv->probe(of_dev, match);
-	if (error)
-		of_dev_put(of_dev);
-
-	return error;
-}
-
-static int of_platform_device_remove(struct device *dev)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-
-	if (dev->driver && drv->remove)
-		drv->remove(of_dev);
-	return 0;
-}
-
-static int of_platform_device_suspend(struct device *dev, pm_message_t state)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-	int error = 0;
-
-	if (dev->driver && drv->suspend)
-		error = drv->suspend(of_dev, state);
-	return error;
-}
-
-static int of_platform_device_resume(struct device * dev)
-{
-	struct of_device * of_dev = to_of_device(dev);
-	struct of_platform_driver * drv = to_of_platform_driver(dev->driver);
-	int error = 0;
-
-	if (dev->driver && drv->resume)
-		error = drv->resume(of_dev);
-	return error;
-}
+#include <linux/errno.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
 
 void __iomem *of_ioremap(struct resource *res, unsigned long offset, unsigned long size, char *name)
 {
@@ -123,47 +55,19 @@ struct of_device *of_find_device_by_node(struct device_node *dp)
 EXPORT_SYMBOL(of_find_device_by_node);
 
 #ifdef CONFIG_PCI
-struct bus_type isa_bus_type = {
-       .name	= "isa",
-       .match	= of_platform_bus_match,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
-};
+struct bus_type isa_bus_type;
 EXPORT_SYMBOL(isa_bus_type);
 
-struct bus_type ebus_bus_type = {
-       .name	= "ebus",
-       .match	= of_platform_bus_match,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
-};
+struct bus_type ebus_bus_type;
 EXPORT_SYMBOL(ebus_bus_type);
 #endif
 
 #ifdef CONFIG_SBUS
-struct bus_type sbus_bus_type = {
-       .name	= "sbus",
-       .match	= of_platform_bus_match,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
-};
+struct bus_type sbus_bus_type;
 EXPORT_SYMBOL(sbus_bus_type);
 #endif
 
-struct bus_type of_platform_bus_type = {
-       .name	= "of",
-       .match	= of_platform_bus_match,
-       .probe	= of_platform_device_probe,
-       .remove	= of_platform_device_remove,
-       .suspend	= of_platform_device_suspend,
-       .resume	= of_platform_device_resume,
-};
+struct bus_type of_platform_bus_type;
 EXPORT_SYMBOL(of_platform_bus_type);
 
 static inline u64 of_read_addr(const u32 *cell, int size)
@@ -926,16 +830,16 @@ static int __init of_bus_driver_init(void)
 {
 	int err;
 
-	err = bus_register(&of_platform_bus_type);
+	err = of_bus_type_init(&of_platform_bus_type, "of");
 #ifdef CONFIG_PCI
 	if (!err)
-		err = bus_register(&isa_bus_type);
+		err = of_bus_type_init(&isa_bus_type, "isa");
 	if (!err)
-		err = bus_register(&ebus_bus_type);
+		err = of_bus_type_init(&ebus_bus_type, "ebus");
 #endif
 #ifdef CONFIG_SBUS
 	if (!err)
-		err = bus_register(&sbus_bus_type);
+		err = of_bus_type_init(&sbus_bus_type, "sbus");
 #endif
 
 	if (!err)
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index c46d998e367b..ab9be5d5255b 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -1,2 +1,2 @@
 obj-y = base.o
-obj-$(CONFIG_OF_DEVICE) += device.o
+obj-$(CONFIG_OF_DEVICE) += device.o platform.o
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
new file mode 100644
index 000000000000..864f09fd9f86
--- /dev/null
+++ b/drivers/of/platform.c
@@ -0,0 +1,96 @@
+/*
+ *    Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp.
+ *			 <benh@kernel.crashing.org>
+ *    and		 Arnd Bergmann, IBM Corp.
+ *    Merged from powerpc/kernel/of_platform.c and
+ *    sparc{,64}/kernel/of_device.c by Stephen Rothwell
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/errno.h>
+#include <linux/device.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+
+static int of_platform_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct of_device *of_dev = to_of_device(dev);
+	struct of_platform_driver *of_drv = to_of_platform_driver(drv);
+	const struct of_device_id *matches = of_drv->match_table;
+
+	if (!matches)
+		return 0;
+
+	return of_match_device(matches, of_dev) != NULL;
+}
+
+static int of_platform_device_probe(struct device *dev)
+{
+	int error = -ENODEV;
+	struct of_platform_driver *drv;
+	struct of_device *of_dev;
+	const struct of_device_id *match;
+
+	drv = to_of_platform_driver(dev->driver);
+	of_dev = to_of_device(dev);
+
+	if (!drv->probe)
+		return error;
+
+	of_dev_get(of_dev);
+
+	match = of_match_device(drv->match_table, of_dev);
+	if (match)
+		error = drv->probe(of_dev, match);
+	if (error)
+		of_dev_put(of_dev);
+
+	return error;
+}
+
+static int of_platform_device_remove(struct device *dev)
+{
+	struct of_device *of_dev = to_of_device(dev);
+	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+
+	if (dev->driver && drv->remove)
+		drv->remove(of_dev);
+	return 0;
+}
+
+static int of_platform_device_suspend(struct device *dev, pm_message_t state)
+{
+	struct of_device *of_dev = to_of_device(dev);
+	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+	int error = 0;
+
+	if (dev->driver && drv->suspend)
+		error = drv->suspend(of_dev, state);
+	return error;
+}
+
+static int of_platform_device_resume(struct device * dev)
+{
+	struct of_device *of_dev = to_of_device(dev);
+	struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+	int error = 0;
+
+	if (dev->driver && drv->resume)
+		error = drv->resume(of_dev);
+	return error;
+}
+
+int of_bus_type_init(struct bus_type *bus, const char *name)
+{
+	bus->name = name;
+	bus->match = of_platform_bus_match;
+	bus->probe = of_platform_device_probe;
+	bus->remove = of_platform_device_remove;
+	bus->suspend = of_platform_device_suspend;
+	bus->resume = of_platform_device_resume;
+	return bus_register(bus);
+}
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index c85d0f835783..5fd44e63fb26 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -52,4 +52,6 @@ struct of_platform_driver
 
 extern struct of_device *of_find_device_by_node(struct device_node *np);
 
+extern int of_bus_type_init(struct bus_type *bus, const char *name);
+
 #endif	/* _LINUX_OF_PLATFORM_H */
-- 
cgit v1.3-8-gc7d7


From c2dea2d1fdbce86942dba0a968c523d8b7858bb5 Mon Sep 17 00:00:00 2001
From: Vasily Tarasov <vtaras@openvz.org>
Date: Fri, 20 Jul 2007 10:06:38 +0200
Subject: cfq: async queue allocation per priority

If we have two processes with different ioprio_class, but the same
ioprio_data, their async requests will fall into the same queue. I guess
such behavior is not expected, because it's not right to put real-time
requests and best-effort requests in the same queue.

The attached patch fixes the problem by introducing additional *cfqq
fields on cfqd, pointing to per-(class,priority) async queues.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c    | 56 +++++++++++++++++++++++++++++++++++++++-----------
 include/linux/ioprio.h |  8 ++++++++
 2 files changed, 52 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9755a3cfad26..bc7190eed10d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -92,7 +92,11 @@ struct cfq_data {
 	struct cfq_queue *active_queue;
 	struct cfq_io_context *active_cic;
 
-	struct cfq_queue *async_cfqq[IOPRIO_BE_NR];
+	/*
+	 * async queue for each priority case
+	 */
+	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
+	struct cfq_queue *async_idle_cfqq;
 
 	struct timer_list idle_class_timer;
 
@@ -1414,24 +1418,44 @@ out:
 	return cfqq;
 }
 
+static struct cfq_queue **
+cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
+{
+	switch(ioprio_class) {
+	case IOPRIO_CLASS_RT:
+		return &cfqd->async_cfqq[0][ioprio];
+	case IOPRIO_CLASS_BE:
+		return &cfqd->async_cfqq[1][ioprio];
+	case IOPRIO_CLASS_IDLE:
+		return &cfqd->async_idle_cfqq;
+	default:
+		BUG();
+	}
+}
+
 static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk,
 	      gfp_t gfp_mask)
 {
 	const int ioprio = task_ioprio(tsk);
+	const int ioprio_class = task_ioprio_class(tsk);
+	struct cfq_queue **async_cfqq = NULL;
 	struct cfq_queue *cfqq = NULL;
 
-	if (!is_sync)
-		cfqq = cfqd->async_cfqq[ioprio];
+	if (!is_sync) {
+		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
+		cfqq = *async_cfqq;
+	}
+
 	if (!cfqq)
 		cfqq = cfq_find_alloc_queue(cfqd, is_sync, tsk, gfp_mask);
 
 	/*
 	 * pin the queue now that it's allocated, scheduler exit will prune it
 	 */
-	if (!is_sync && !cfqd->async_cfqq[ioprio]) {
+	if (!is_sync && !(*async_cfqq)) {
 		atomic_inc(&cfqq->ref);
-		cfqd->async_cfqq[ioprio] = cfqq;
+		*async_cfqq = cfqq;
 	}
 
 	atomic_inc(&cfqq->ref);
@@ -2042,11 +2066,24 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 	blk_sync_queue(cfqd->queue);
 }
 
+static void cfq_put_async_queues(struct cfq_data *cfqd)
+{
+	int i;
+
+	for (i = 0; i < IOPRIO_BE_NR; i++) {
+		if (cfqd->async_cfqq[0][i])
+			cfq_put_queue(cfqd->async_cfqq[0][i]);
+		if (cfqd->async_cfqq[1][i])
+			cfq_put_queue(cfqd->async_cfqq[1][i]);
+		if (cfqd->async_idle_cfqq)
+			cfq_put_queue(cfqd->async_idle_cfqq);
+	}
+}
+
 static void cfq_exit_queue(elevator_t *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
 	request_queue_t *q = cfqd->queue;
-	int i;
 
 	cfq_shutdown_timer_wq(cfqd);
 
@@ -2063,12 +2100,7 @@ static void cfq_exit_queue(elevator_t *e)
 		__cfq_exit_single_io_context(cfqd, cic);
 	}
 
-	/*
-	 * Put the async queues
-	 */
-	for (i = 0; i < IOPRIO_BE_NR; i++)
-		if (cfqd->async_cfqq[i])	
-			cfq_put_queue(cfqd->async_cfqq[i]);
+	cfq_put_async_queues(cfqd);
 
 	spin_unlock_irq(q->queue_lock);
 
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 2eaa142cd061..baf29387cab4 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -53,6 +53,14 @@ static inline int task_ioprio(struct task_struct *task)
 	return IOPRIO_NORM;
 }
 
+static inline int task_ioprio_class(struct task_struct *task)
+{
+	if (ioprio_valid(task->ioprio))
+		return IOPRIO_PRIO_CLASS(task->ioprio);
+
+	return IOPRIO_CLASS_BE;
+}
+
 static inline int task_nice_ioprio(struct task_struct *task)
 {
 	return (task_nice(task) + 20) / 5;
-- 
cgit v1.3-8-gc7d7


From 7a05f067c0da139613cbe74583bb7d208a5f87b9 Mon Sep 17 00:00:00 2001
From: Liam Girdwood <lg@opensource.wolfsonmicro.com>
Date: Mon, 14 May 2007 11:05:09 +0200
Subject: [ALSA] ASoC S3C24xx machine drivers - I2C ID for LM4857

This patch adds I2C ID for the LM4857 audio amp and corrects the spacing
of the WM8731, WM8750 and WM8753 ID's.

Signed-off-by: Liam Girdwood <lg@opensource.wolfsonmicro.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/linux/i2c-id.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index aa83d4163096..b69014865714 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -115,9 +115,10 @@
 #define I2C_DRIVERID_KS0127	86	/* Samsung ks0127 video decoder */
 #define I2C_DRIVERID_TLV320AIC23B 87	/* TI TLV320AIC23B audio codec  */
 #define I2C_DRIVERID_ISL1208	88	/* Intersil ISL1208 RTC		*/
-#define I2C_DRIVERID_WM8731		89	/* Wolfson WM8731 audio codec */
-#define I2C_DRIVERID_WM8750		90	/* Wolfson WM8750 audio codec */
-#define I2C_DRIVERID_WM8753		91	/* Wolfson WM8753 audio codec */
+#define I2C_DRIVERID_WM8731	89	/* Wolfson WM8731 audio codec */
+#define I2C_DRIVERID_WM8750	90	/* Wolfson WM8750 audio codec */
+#define I2C_DRIVERID_WM8753	91	/* Wolfson WM8753 audio codec */
+#define I2C_DRIVERID_LM4857 	92 	/* LM4857 Audio Amplifier */
 
 #define I2C_DRIVERID_I2CDEV	900
 #define I2C_DRIVERID_ARP        902    /* SMBus ARP Client              */
-- 
cgit v1.3-8-gc7d7


From 9977126c4b65c1396b665f7a0eeb8c7dede336f9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 16 Jul 2007 14:29:38 +0900
Subject: libata: add @is_cmd to ata_tf_to_fis()

Add @is_cmd to ata_tf_to_fis().  This controls bit 7 of the second
byte which tells the device whether this H2D FIS is for a command or
not.  This cleans up ahci a bit and will be used by PMP.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/ahci.c        | 10 ++++------
 drivers/ata/libata-core.c | 14 ++++++++------
 drivers/ata/sata_qstor.c  |  2 +-
 drivers/ata/sata_sil24.c  |  2 +-
 include/linux/libata.h    |  3 ++-
 5 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index eaec5e506f5e..61c5b6e68de4 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1020,8 +1020,7 @@ static int ahci_softreset(struct ata_port *ap, unsigned int *class,
 			   cmd_fis_len | AHCI_CMD_RESET | AHCI_CMD_CLR_BUSY);
 
 	tf.ctl |= ATA_SRST;
-	ata_tf_to_fis(&tf, fis, 0);
-	fis[1] &= ~(1 << 7);	/* turn off Command FIS bit */
+	ata_tf_to_fis(&tf, 0, 0, fis);
 
 	writel(1, port_mmio + PORT_CMD_ISSUE);
 
@@ -1039,8 +1038,7 @@ static int ahci_softreset(struct ata_port *ap, unsigned int *class,
 	ahci_fill_cmd_slot(pp, 0, cmd_fis_len);
 
 	tf.ctl &= ~ATA_SRST;
-	ata_tf_to_fis(&tf, fis, 0);
-	fis[1] &= ~(1 << 7);	/* turn off Command FIS bit */
+	ata_tf_to_fis(&tf, 0, 0, fis);
 
 	writel(1, port_mmio + PORT_CMD_ISSUE);
 	readl(port_mmio + PORT_CMD_ISSUE);	/* flush */
@@ -1088,7 +1086,7 @@ static int ahci_hardreset(struct ata_port *ap, unsigned int *class,
 	/* clear D2H reception area to properly wait for D2H FIS */
 	ata_tf_init(ap->device, &tf);
 	tf.command = 0x80;
-	ata_tf_to_fis(&tf, d2h_fis, 0);
+	ata_tf_to_fis(&tf, 0, 0, d2h_fis);
 
 	rc = sata_std_hardreset(ap, class, deadline);
 
@@ -1205,7 +1203,7 @@ static void ahci_qc_prep(struct ata_queued_cmd *qc)
 	 */
 	cmd_tbl = pp->cmd_tbl + qc->tag * AHCI_CMD_TBL_SZ;
 
-	ata_tf_to_fis(&qc->tf, cmd_tbl, 0);
+	ata_tf_to_fis(&qc->tf, 0, 1, cmd_tbl);
 	if (is_atapi) {
 		memset(cmd_tbl + AHCI_CMD_TBL_CDB, 0, 32);
 		memcpy(cmd_tbl + AHCI_CMD_TBL_CDB, qc->cdb, qc->dev->cdb_len);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index e2ecb7a4628e..39a8e986a4ea 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -111,8 +111,9 @@ MODULE_VERSION(DRV_VERSION);
 /**
  *	ata_tf_to_fis - Convert ATA taskfile to SATA FIS structure
  *	@tf: Taskfile to convert
- *	@fis: Buffer into which data will output
  *	@pmp: Port multiplier port
+ *	@is_cmd: This FIS is for command
+ *	@fis: Buffer into which data will output
  *
  *	Converts a standard ATA taskfile to a Serial ATA
  *	FIS structure (Register - Host to Device).
@@ -120,12 +121,13 @@ MODULE_VERSION(DRV_VERSION);
  *	LOCKING:
  *	Inherited from caller.
  */
-
-void ata_tf_to_fis(const struct ata_taskfile *tf, u8 *fis, u8 pmp)
+void ata_tf_to_fis(const struct ata_taskfile *tf, u8 pmp, int is_cmd, u8 *fis)
 {
-	fis[0] = 0x27;	/* Register - Host to Device FIS */
-	fis[1] = (pmp & 0xf) | (1 << 7); /* Port multiplier number,
-					    bit 7 indicates Command FIS */
+	fis[0] = 0x27;			/* Register - Host to Device FIS */
+	fis[1] = pmp & 0xf;		/* Port multiplier number*/
+	if (is_cmd)
+		fis[1] |= (1 << 7);	/* bit 7 indicates Command FIS */
+
 	fis[2] = tf->command;
 	fis[3] = tf->feature;
 
diff --git a/drivers/ata/sata_qstor.c b/drivers/ata/sata_qstor.c
index 9ab554da89bf..5aef4ac37012 100644
--- a/drivers/ata/sata_qstor.c
+++ b/drivers/ata/sata_qstor.c
@@ -337,7 +337,7 @@ static void qs_qc_prep(struct ata_queued_cmd *qc)
 	buf[28] = dflags;
 
 	/* frame information structure (FIS) */
-	ata_tf_to_fis(&qc->tf, &buf[32], 0);
+	ata_tf_to_fis(&qc->tf, 0, 1, &buf[32]);
 }
 
 static inline void qs_packet_start(struct ata_queued_cmd *qc)
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index ac43a30ebe29..a11007b5071e 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -699,7 +699,7 @@ static void sil24_qc_prep(struct ata_queued_cmd *qc)
 	}
 
 	prb->ctrl = cpu_to_le16(ctrl);
-	ata_tf_to_fis(&qc->tf, prb->fis, 0);
+	ata_tf_to_fis(&qc->tf, 0, 1, prb->fis);
 
 	if (qc->flags & ATA_QCFLAG_DMAMAP)
 		sil24_fill_sg(qc, sge);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 47cd2a1c5544..5d3df6cde272 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -764,7 +764,8 @@ extern unsigned int ata_dev_try_classify(struct ata_port *, unsigned int, u8 *);
  */
 extern void ata_tf_load(struct ata_port *ap, const struct ata_taskfile *tf);
 extern void ata_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
-extern void ata_tf_to_fis(const struct ata_taskfile *tf, u8 *fis, u8 pmp);
+extern void ata_tf_to_fis(const struct ata_taskfile *tf,
+			  u8 pmp, int is_cmd, u8 *fis);
 extern void ata_tf_from_fis(const u8 *fis, struct ata_taskfile *tf);
 extern void ata_noop_dev_select (struct ata_port *ap, unsigned int device);
 extern void ata_std_dev_select (struct ata_port *ap, unsigned int device);
-- 
cgit v1.3-8-gc7d7


From b64bbc39f2122a2276578e40144af69ef01decd4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 16 Jul 2007 14:29:39 +0900
Subject: libata: improve EH report formatting

Requiring LLDs to format multiple error description messages properly
doesn't work too well.  Help LLDs a bit by making ata_ehi_push_desc()
insert ", " on each invocation.  __ata_ehi_push_desc() is the raw
version without the automatic separator.

While at it, make ehi_desc interface proper functions instead of
macros.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/ahci.c        |  6 ++---
 drivers/ata/libata-core.c |  3 +++
 drivers/ata/libata-eh.c   | 69 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/ata/sata_mv.c     |  8 +++---
 drivers/ata/sata_nv.c     | 22 ++++++++-------
 drivers/ata/sata_sil24.c  | 12 ++++-----
 include/linux/libata.h    | 13 +++------
 7 files changed, 98 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index c5034d450c62..210292cd8ad1 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1289,12 +1289,12 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat)
 	if (irq_stat & PORT_IRQ_IF_ERR) {
 		err_mask |= AC_ERR_ATA_BUS;
 		action |= ATA_EH_SOFTRESET;
-		ata_ehi_push_desc(ehi, ", interface fatal error");
+		ata_ehi_push_desc(ehi, "interface fatal error");
 	}
 
 	if (irq_stat & (PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)) {
 		ata_ehi_hotplugged(ehi);
-		ata_ehi_push_desc(ehi, ", %s", irq_stat & PORT_IRQ_CONNECT ?
+		ata_ehi_push_desc(ehi, "%s", irq_stat & PORT_IRQ_CONNECT ?
 			"connection status changed" : "PHY RDY changed");
 	}
 
@@ -1303,7 +1303,7 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat)
 
 		err_mask |= AC_ERR_HSM;
 		action |= ATA_EH_SOFTRESET;
-		ata_ehi_push_desc(ehi, ", unknown FIS %08x %08x %08x %08x",
+		ata_ehi_push_desc(ehi, "unknown FIS %08x %08x %08x %08x",
 				  unk[0], unk[1], unk[2], unk[3]);
 	}
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 39a8e986a4ea..ecbc3278238a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6945,6 +6945,9 @@ EXPORT_SYMBOL_GPL(ata_pci_default_filter);
 EXPORT_SYMBOL_GPL(ata_pci_clear_simplex);
 #endif /* CONFIG_PCI */
 
+EXPORT_SYMBOL_GPL(__ata_ehi_push_desc);
+EXPORT_SYMBOL_GPL(ata_ehi_push_desc);
+EXPORT_SYMBOL_GPL(ata_ehi_clear_desc);
 EXPORT_SYMBOL_GPL(ata_eng_timeout);
 EXPORT_SYMBOL_GPL(ata_port_schedule_eh);
 EXPORT_SYMBOL_GPL(ata_port_abort);
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 9aa62a0754f6..96b184ebf708 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -85,6 +85,71 @@ static void ata_eh_handle_port_resume(struct ata_port *ap)
 { }
 #endif /* CONFIG_PM */
 
+static void __ata_ehi_pushv_desc(struct ata_eh_info *ehi, const char *fmt,
+				 va_list args)
+{
+	ehi->desc_len += vscnprintf(ehi->desc + ehi->desc_len,
+				     ATA_EH_DESC_LEN - ehi->desc_len,
+				     fmt, args);
+}
+
+/**
+ *	__ata_ehi_push_desc - push error description without adding separator
+ *	@ehi: target EHI
+ *	@fmt: printf format string
+ *
+ *	Format string according to @fmt and append it to @ehi->desc.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host lock)
+ */
+void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	__ata_ehi_pushv_desc(ehi, fmt, args);
+	va_end(args);
+}
+
+/**
+ *	ata_ehi_push_desc - push error description with separator
+ *	@ehi: target EHI
+ *	@fmt: printf format string
+ *
+ *	Format string according to @fmt and append it to @ehi->desc.
+ *	If @ehi->desc is not empty, ", " is added in-between.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host lock)
+ */
+void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...)
+{
+	va_list args;
+
+	if (ehi->desc_len)
+		__ata_ehi_push_desc(ehi, ", ");
+
+	va_start(args, fmt);
+	__ata_ehi_pushv_desc(ehi, fmt, args);
+	va_end(args);
+}
+
+/**
+ *	ata_ehi_clear_desc - clean error description
+ *	@ehi: target EHI
+ *
+ *	Clear @ehi->desc.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host lock)
+ */
+void ata_ehi_clear_desc(struct ata_eh_info *ehi)
+{
+	ehi->desc[0] = '\0';
+	ehi->desc_len = 0;
+}
+
 static void ata_ering_record(struct ata_ering *ering, int is_io,
 			     unsigned int err_mask)
 {
@@ -1524,14 +1589,14 @@ static void ata_eh_report(struct ata_port *ap)
 			       ehc->i.err_mask, ap->sactive, ehc->i.serror,
 			       ehc->i.action, frozen);
 		if (desc)
-			ata_dev_printk(ehc->i.dev, KERN_ERR, "(%s)\n", desc);
+			ata_dev_printk(ehc->i.dev, KERN_ERR, "%s\n", desc);
 	} else {
 		ata_port_printk(ap, KERN_ERR, "exception Emask 0x%x "
 				"SAct 0x%x SErr 0x%x action 0x%x%s\n",
 				ehc->i.err_mask, ap->sactive, ehc->i.serror,
 				ehc->i.action, frozen);
 		if (desc)
-			ata_port_printk(ap, KERN_ERR, "(%s)\n", desc);
+			ata_port_printk(ap, KERN_ERR, "%s\n", desc);
 	}
 
 	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 80ade5b93b86..b4b737e081e8 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -1411,12 +1411,12 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
 			EDMA_ERR_INTRL_PAR)) {
 		err_mask |= AC_ERR_ATA_BUS;
 		action |= ATA_EH_HARDRESET;
-		ata_ehi_push_desc(ehi, ", parity error");
+		ata_ehi_push_desc(ehi, "parity error");
 	}
 	if (edma_err_cause & (EDMA_ERR_DEV_DCON | EDMA_ERR_DEV_CON)) {
 		ata_ehi_hotplugged(ehi);
 		ata_ehi_push_desc(ehi, edma_err_cause & EDMA_ERR_DEV_DCON ?
-			", dev disconnect" : ", dev connect");
+			"dev disconnect" : "dev connect");
 	}
 
 	if (IS_GEN_I(hpriv)) {
@@ -1425,7 +1425,7 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
 		if (edma_err_cause & EDMA_ERR_SELF_DIS_5) {
 			struct mv_port_priv *pp	= ap->private_data;
 			pp->pp_flags &= ~MV_PP_FLAG_EDMA_EN;
-			ata_ehi_push_desc(ehi, ", EDMA self-disable");
+			ata_ehi_push_desc(ehi, "EDMA self-disable");
 		}
 	} else {
 		eh_freeze_mask = EDMA_EH_FREEZE;
@@ -1433,7 +1433,7 @@ static void mv_err_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
 		if (edma_err_cause & EDMA_ERR_SELF_DIS) {
 			struct mv_port_priv *pp	= ap->private_data;
 			pp->pp_flags &= ~MV_PP_FLAG_EDMA_EN;
-			ata_ehi_push_desc(ehi, ", EDMA self-disable");
+			ata_ehi_push_desc(ehi, "EDMA self-disable");
 		}
 
 		if (edma_err_cause & EDMA_ERR_SERR) {
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index db81e3efa5ec..5d943da042f7 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -715,19 +715,20 @@ static int nv_adma_check_cpb(struct ata_port *ap, int cpb_num, int force_err)
 		int freeze = 0;
 
 		ata_ehi_clear_desc(ehi);
-		ata_ehi_push_desc(ehi, "CPB resp_flags 0x%x", flags );
+		__ata_ehi_push_desc(ehi, "CPB resp_flags 0x%x: ", flags );
 		if (flags & NV_CPB_RESP_ATA_ERR) {
-			ata_ehi_push_desc(ehi, ": ATA error");
+			ata_ehi_push_desc(ehi, "ATA error");
 			ehi->err_mask |= AC_ERR_DEV;
 		} else if (flags & NV_CPB_RESP_CMD_ERR) {
-			ata_ehi_push_desc(ehi, ": CMD error");
+			ata_ehi_push_desc(ehi, "CMD error");
 			ehi->err_mask |= AC_ERR_DEV;
 		} else if (flags & NV_CPB_RESP_CPB_ERR) {
-			ata_ehi_push_desc(ehi, ": CPB error");
+			ata_ehi_push_desc(ehi, "CPB error");
 			ehi->err_mask |= AC_ERR_SYSTEM;
 			freeze = 1;
 		} else {
 			/* notifier error, but no error in CPB flags? */
+			ata_ehi_push_desc(ehi, "unknown");
 			ehi->err_mask |= AC_ERR_OTHER;
 			freeze = 1;
 		}
@@ -854,20 +855,21 @@ static irqreturn_t nv_adma_interrupt(int irq, void *dev_instance)
 				struct ata_eh_info *ehi = &ap->eh_info;
 
 				ata_ehi_clear_desc(ehi);
-				ata_ehi_push_desc(ehi, "ADMA status 0x%08x", status );
+				__ata_ehi_push_desc(ehi, "ADMA status 0x%08x: ", status );
 				if (status & NV_ADMA_STAT_TIMEOUT) {
 					ehi->err_mask |= AC_ERR_SYSTEM;
-					ata_ehi_push_desc(ehi, ": timeout");
+					ata_ehi_push_desc(ehi, "timeout");
 				} else if (status & NV_ADMA_STAT_HOTPLUG) {
 					ata_ehi_hotplugged(ehi);
-					ata_ehi_push_desc(ehi, ": hotplug");
+					ata_ehi_push_desc(ehi, "hotplug");
 				} else if (status & NV_ADMA_STAT_HOTUNPLUG) {
 					ata_ehi_hotplugged(ehi);
-					ata_ehi_push_desc(ehi, ": hot unplug");
+					ata_ehi_push_desc(ehi, "hot unplug");
 				} else if (status & NV_ADMA_STAT_SERROR) {
 					/* let libata analyze SError and figure out the cause */
-					ata_ehi_push_desc(ehi, ": SError");
-				}
+					ata_ehi_push_desc(ehi, "SError");
+				} else
+					ata_ehi_push_desc(ehi, "unknown");
 				ata_port_freeze(ap);
 				continue;
 			}
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index e538edc1b4ea..e201f1cab66d 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -814,16 +814,16 @@ static void sil24_error_intr(struct ata_port *ap)
 
 	if (irq_stat & (PORT_IRQ_PHYRDY_CHG | PORT_IRQ_DEV_XCHG)) {
 		ata_ehi_hotplugged(ehi);
-		ata_ehi_push_desc(ehi, ", %s",
-			       irq_stat & PORT_IRQ_PHYRDY_CHG ?
-			       "PHY RDY changed" : "device exchanged");
+		ata_ehi_push_desc(ehi, "%s",
+				  irq_stat & PORT_IRQ_PHYRDY_CHG ?
+				  "PHY RDY changed" : "device exchanged");
 		freeze = 1;
 	}
 
 	if (irq_stat & PORT_IRQ_UNK_FIS) {
 		ehi->err_mask |= AC_ERR_HSM;
 		ehi->action |= ATA_EH_SOFTRESET;
-		ata_ehi_push_desc(ehi , ", unknown FIS");
+		ata_ehi_push_desc(ehi, "unknown FIS");
 		freeze = 1;
 	}
 
@@ -842,11 +842,11 @@ static void sil24_error_intr(struct ata_port *ap)
 		if (ci && ci->desc) {
 			err_mask |= ci->err_mask;
 			action |= ci->action;
-			ata_ehi_push_desc(ehi, ", %s", ci->desc);
+			ata_ehi_push_desc(ehi, "%s", ci->desc);
 		} else {
 			err_mask |= AC_ERR_OTHER;
 			action |= ATA_EH_SOFTRESET;
-			ata_ehi_push_desc(ehi, ", unknown command error %d",
+			ata_ehi_push_desc(ehi, "unknown command error %d",
 					  cerr);
 		}
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5d3df6cde272..94b37d180680 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -910,16 +910,9 @@ extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 /*
  * ata_eh_info helpers
  */
-#define ata_ehi_push_desc(ehi, fmt, args...) do { \
-	(ehi)->desc_len += scnprintf((ehi)->desc + (ehi)->desc_len, \
-				     ATA_EH_DESC_LEN - (ehi)->desc_len, \
-				     fmt , ##args); \
-} while (0)
-
-#define ata_ehi_clear_desc(ehi) do { \
-	(ehi)->desc[0] = '\0'; \
-	(ehi)->desc_len = 0; \
-} while (0)
+extern void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...);
+extern void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...);
+extern void ata_ehi_clear_desc(struct ata_eh_info *ehi);
 
 static inline void __ata_ehi_hotplugged(struct ata_eh_info *ehi)
 {
-- 
cgit v1.3-8-gc7d7


From 5335b729064e03319cd2d5219770451dbb1d7f67 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 16 Jul 2007 14:29:40 +0900
Subject: libata: implement AC_ERR_NCQ

When an NCQ command fails, all commands in flight are aborted and the
offending one is reported using log page 10h.  Depending on controller
characteristics and LLD implementation, all commands may appear as
having a device error due to shared TF status making it hard to
determine what's actually going on.

This patch adds AC_ERR_NCQ, marks the command reported by log page 10h
with it and print extra "<F>" after the error report for the command
to help distinguishing the offending command.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-eh.c | 7 ++++---
 include/linux/libata.h  | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 96b184ebf708..19f9947bd96b 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1195,7 +1195,7 @@ static void ata_eh_analyze_ncq_error(struct ata_port *ap)
 	/* we've got the perpetrator, condemn it */
 	qc = __ata_qc_from_tag(ap, tag);
 	memcpy(&qc->result_tf, &tf, sizeof(tf));
-	qc->err_mask |= AC_ERR_DEV;
+	qc->err_mask |= AC_ERR_DEV | AC_ERR_NCQ;
 	ehc->i.err_mask &= ~AC_ERR_DEV;
 }
 
@@ -1616,7 +1616,7 @@ static void ata_eh_report(struct ata_port *ap)
 			"cmd %02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x "
 			"tag %d cdb 0x%x data %u %s\n         "
 			"res %02x/%02x:%02x:%02x:%02x:%02x/%02x:%02x:%02x:%02x:%02x/%02x "
-			"Emask 0x%x (%s)\n",
+			"Emask 0x%x (%s)%s\n",
 			cmd->command, cmd->feature, cmd->nsect,
 			cmd->lbal, cmd->lbam, cmd->lbah,
 			cmd->hob_feature, cmd->hob_nsect,
@@ -1627,7 +1627,8 @@ static void ata_eh_report(struct ata_port *ap)
 			res->lbal, res->lbam, res->lbah,
 			res->hob_feature, res->hob_nsect,
 			res->hob_lbal, res->hob_lbam, res->hob_lbah,
-			res->device, qc->err_mask, ata_err_string(qc->err_mask));
+			res->device, qc->err_mask, ata_err_string(qc->err_mask),
+			qc->err_mask & AC_ERR_NCQ ? " <F>" : "");
 	}
 }
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 94b37d180680..cb181713d9b5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -323,6 +323,7 @@ enum ata_completion_errors {
 	AC_ERR_INVALID		= (1 << 7), /* invalid argument */
 	AC_ERR_OTHER		= (1 << 8), /* unknown */
 	AC_ERR_NODEV_HINT	= (1 << 9), /* polling device detection hint */
+	AC_ERR_NCQ		= (1 << 10), /* marker for offending NCQ qc */
 };
 
 /* forward declarations */
-- 
cgit v1.3-8-gc7d7


From da3dbb17a0e9a9ec7f5aed95f1fddadb790edc9d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 16 Jul 2007 14:29:40 +0900
Subject: libata: make ->scr_read/write callbacks return error code

Convert ->scr_read/write callbacks to return error code to better
indicate failure.  This will help handling of SCR_NOTIFICATION.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/ahci.c          | 23 ++++++++-------
 drivers/ata/libata-core.c   | 21 +++++++------
 drivers/ata/sata_inic162x.c | 16 +++++-----
 drivers/ata/sata_mv.c       | 72 ++++++++++++++++++++++++++++++---------------
 drivers/ata/sata_nv.c       | 16 +++++-----
 drivers/ata/sata_promise.c  | 25 +++++++++-------
 drivers/ata/sata_qstor.c    | 16 +++++-----
 drivers/ata/sata_sil.c      | 25 ++++++++++------
 drivers/ata/sata_sil24.c    | 17 +++++++----
 drivers/ata/sata_sis.c      | 22 +++++++-------
 drivers/ata/sata_svw.c      | 13 ++++----
 drivers/ata/sata_uli.c      | 16 +++++-----
 drivers/ata/sata_via.c      | 27 +++++++++--------
 drivers/ata/sata_vsc.c      | 13 ++++----
 include/linux/libata.h      |  5 ++--
 15 files changed, 191 insertions(+), 136 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 210292cd8ad1..e044d6477a0f 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -216,8 +216,8 @@ struct ahci_port_priv {
 	unsigned int		ncq_saw_sdb:1;
 };
 
-static u32 ahci_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void ahci_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int ahci_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int ahci_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val);
 static int ahci_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
 static unsigned int ahci_qc_issue(struct ata_queued_cmd *qc);
 static void ahci_irq_clear(struct ata_port *ap);
@@ -625,7 +625,7 @@ static void ahci_restore_initial_config(struct ata_host *host)
 	(void) readl(mmio + HOST_PORTS_IMPL);	/* flush */
 }
 
-static u32 ahci_scr_read (struct ata_port *ap, unsigned int sc_reg_in)
+static int ahci_scr_read(struct ata_port *ap, unsigned int sc_reg_in, u32 *val)
 {
 	unsigned int sc_reg;
 
@@ -635,15 +635,15 @@ static u32 ahci_scr_read (struct ata_port *ap, unsigned int sc_reg_in)
 	case SCR_ERROR:		sc_reg = 2; break;
 	case SCR_ACTIVE:	sc_reg = 3; break;
 	default:
-		return 0xffffffffU;
+		return -EINVAL;
 	}
 
-	return readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+	*val = readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 
-static void ahci_scr_write (struct ata_port *ap, unsigned int sc_reg_in,
-			       u32 val)
+static int ahci_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val)
 {
 	unsigned int sc_reg;
 
@@ -653,10 +653,11 @@ static void ahci_scr_write (struct ata_port *ap, unsigned int sc_reg_in,
 	case SCR_ERROR:		sc_reg = 2; break;
 	case SCR_ACTIVE:	sc_reg = 3; break;
 	default:
-		return;
+		return -EINVAL;
 	}
 
 	writel(val, ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 static void ahci_start_engine(struct ata_port *ap)
@@ -1133,6 +1134,7 @@ static int ahci_hardreset(struct ata_port *ap, unsigned int *class,
 static int ahci_vt8251_hardreset(struct ata_port *ap, unsigned int *class,
 				 unsigned long deadline)
 {
+	u32 serror;
 	int rc;
 
 	DPRINTK("ENTER\n");
@@ -1143,7 +1145,8 @@ static int ahci_vt8251_hardreset(struct ata_port *ap, unsigned int *class,
 				 deadline);
 
 	/* vt8251 needs SError cleared for the port to operate */
-	ahci_scr_write(ap, SCR_ERROR, ahci_scr_read(ap, SCR_ERROR));
+	ahci_scr_read(ap, SCR_ERROR, &serror);
+	ahci_scr_write(ap, SCR_ERROR, serror);
 
 	ahci_start_engine(ap);
 
@@ -1265,7 +1268,7 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat)
 	ata_ehi_clear_desc(ehi);
 
 	/* AHCI needs SError cleared; otherwise, it might lock up */
-	serror = ahci_scr_read(ap, SCR_ERROR);
+	ahci_scr_read(ap, SCR_ERROR, &serror);
 	ahci_scr_write(ap, SCR_ERROR, serror);
 
 	/* analyze @irq_stat */
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index ecbc3278238a..5718c247e23a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5732,10 +5732,8 @@ int sata_scr_valid(struct ata_port *ap)
  */
 int sata_scr_read(struct ata_port *ap, int reg, u32 *val)
 {
-	if (sata_scr_valid(ap)) {
-		*val = ap->ops->scr_read(ap, reg);
-		return 0;
-	}
+	if (sata_scr_valid(ap))
+		return ap->ops->scr_read(ap, reg, val);
 	return -EOPNOTSUPP;
 }
 
@@ -5757,10 +5755,8 @@ int sata_scr_read(struct ata_port *ap, int reg, u32 *val)
  */
 int sata_scr_write(struct ata_port *ap, int reg, u32 val)
 {
-	if (sata_scr_valid(ap)) {
-		ap->ops->scr_write(ap, reg, val);
-		return 0;
-	}
+	if (sata_scr_valid(ap))
+		return ap->ops->scr_write(ap, reg, val);
 	return -EOPNOTSUPP;
 }
 
@@ -5781,10 +5777,13 @@ int sata_scr_write(struct ata_port *ap, int reg, u32 val)
  */
 int sata_scr_write_flush(struct ata_port *ap, int reg, u32 val)
 {
+	int rc;
+
 	if (sata_scr_valid(ap)) {
-		ap->ops->scr_write(ap, reg, val);
-		ap->ops->scr_read(ap, reg);
-		return 0;
+		rc = ap->ops->scr_write(ap, reg, val);
+		if (rc == 0)
+			rc = ap->ops->scr_read(ap, reg, &val);
+		return rc;
 	}
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
index 3de183461c3c..a9c948d7604a 100644
--- a/drivers/ata/sata_inic162x.c
+++ b/drivers/ata/sata_inic162x.c
@@ -190,34 +190,34 @@ static void inic_reset_port(void __iomem *port_base)
 	writew(ctl, idma_ctl);
 }
 
-static u32 inic_scr_read(struct ata_port *ap, unsigned sc_reg)
+static int inic_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val)
 {
 	void __iomem *scr_addr = ap->ioaddr.scr_addr;
 	void __iomem *addr;
-	u32 val;
 
 	if (unlikely(sc_reg >= ARRAY_SIZE(scr_map)))
-		return 0xffffffffU;
+		return -EINVAL;
 
 	addr = scr_addr + scr_map[sc_reg] * 4;
-	val = readl(scr_addr + scr_map[sc_reg] * 4);
+	*val = readl(scr_addr + scr_map[sc_reg] * 4);
 
 	/* this controller has stuck DIAG.N, ignore it */
 	if (sc_reg == SCR_ERROR)
-		val &= ~SERR_PHYRDY_CHG;
-	return val;
+		*val &= ~SERR_PHYRDY_CHG;
+	return 0;
 }
 
-static void inic_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val)
+static int inic_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val)
 {
 	void __iomem *scr_addr = ap->ioaddr.scr_addr;
 	void __iomem *addr;
 
 	if (unlikely(sc_reg >= ARRAY_SIZE(scr_map)))
-		return;
+		return -EINVAL;
 
 	addr = scr_addr + scr_map[sc_reg] * 4;
 	writel(val, scr_addr + scr_map[sc_reg] * 4);
+	return 0;
 }
 
 /*
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index b4b737e081e8..8ec520885b95 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -404,10 +404,10 @@ struct mv_host_priv {
 };
 
 static void mv_irq_clear(struct ata_port *ap);
-static u32 mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in);
-static void mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
-static u32 mv5_scr_read(struct ata_port *ap, unsigned int sc_reg_in);
-static void mv5_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
+static int mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in, u32 *val);
+static int mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
+static int mv5_scr_read(struct ata_port *ap, unsigned int sc_reg_in, u32 *val);
+static int mv5_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
 static int mv_port_start(struct ata_port *ap);
 static void mv_port_stop(struct ata_port *ap);
 static void mv_qc_prep(struct ata_queued_cmd *qc);
@@ -974,22 +974,26 @@ static unsigned int mv_scr_offset(unsigned int sc_reg_in)
 	return ofs;
 }
 
-static u32 mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in)
+static int mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in, u32 *val)
 {
 	unsigned int ofs = mv_scr_offset(sc_reg_in);
 
-	if (ofs != 0xffffffffU)
-		return readl(mv_ap_base(ap) + ofs);
-	else
-		return (u32) ofs;
+	if (ofs != 0xffffffffU) {
+		*val = readl(mv_ap_base(ap) + ofs);
+		return 0;
+	} else
+		return -EINVAL;
 }
 
-static void mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val)
+static int mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val)
 {
 	unsigned int ofs = mv_scr_offset(sc_reg_in);
 
-	if (ofs != 0xffffffffU)
+	if (ofs != 0xffffffffU) {
 		writelfl(val, mv_ap_base(ap) + ofs);
+		return 0;
+	} else
+		return -EINVAL;
 }
 
 static void mv_edma_cfg(struct ata_port *ap, struct mv_host_priv *hpriv,
@@ -1752,26 +1756,30 @@ static unsigned int mv5_scr_offset(unsigned int sc_reg_in)
 	return ofs;
 }
 
-static u32 mv5_scr_read(struct ata_port *ap, unsigned int sc_reg_in)
+static int mv5_scr_read(struct ata_port *ap, unsigned int sc_reg_in, u32 *val)
 {
 	void __iomem *mmio = ap->host->iomap[MV_PRIMARY_BAR];
 	void __iomem *addr = mv5_phy_base(mmio, ap->port_no);
 	unsigned int ofs = mv5_scr_offset(sc_reg_in);
 
-	if (ofs != 0xffffffffU)
-		return readl(addr + ofs);
-	else
-		return (u32) ofs;
+	if (ofs != 0xffffffffU) {
+		*val = readl(addr + ofs);
+		return 0;
+	} else
+		return -EINVAL;
 }
 
-static void mv5_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val)
+static int mv5_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val)
 {
 	void __iomem *mmio = ap->host->iomap[MV_PRIMARY_BAR];
 	void __iomem *addr = mv5_phy_base(mmio, ap->port_no);
 	unsigned int ofs = mv5_scr_offset(sc_reg_in);
 
-	if (ofs != 0xffffffffU)
+	if (ofs != 0xffffffffU) {
 		writelfl(val, addr + ofs);
+		return 0;
+	} else
+		return -EINVAL;
 }
 
 static void mv5_reset_bus(struct pci_dev *pdev, void __iomem *mmio)
@@ -2149,9 +2157,17 @@ static void mv_phy_reset(struct ata_port *ap, unsigned int *class,
 
 	VPRINTK("ENTER, port %u, mmio 0x%p\n", ap->port_no, port_mmio);
 
-	DPRINTK("S-regs after ATA_RST: SStat 0x%08x SErr 0x%08x "
-		"SCtrl 0x%08x\n", mv_scr_read(ap, SCR_STATUS),
-		mv_scr_read(ap, SCR_ERROR), mv_scr_read(ap, SCR_CONTROL));
+#ifdef DEBUG
+	{
+		u32 sstatus, serror, scontrol;
+
+		mv_scr_read(ap, SCR_STATUS, &sstatus);
+		mv_scr_read(ap, SCR_ERROR, &serror);
+		mv_scr_read(ap, SCR_CONTROL, &scontrol);
+		DPRINTK("S-regs after ATA_RST: SStat 0x%08x SErr 0x%08x "
+			"SCtrl 0x%08x\n", status, serror, scontrol);
+	}
+#endif
 
 	/* Issue COMRESET via SControl */
 comreset_retry:
@@ -2175,9 +2191,17 @@ comreset_retry:
 	    (retry-- > 0))
 		goto comreset_retry;
 
-	DPRINTK("S-regs after PHY wake: SStat 0x%08x SErr 0x%08x "
-		"SCtrl 0x%08x\n", mv_scr_read(ap, SCR_STATUS),
-		mv_scr_read(ap, SCR_ERROR), mv_scr_read(ap, SCR_CONTROL));
+#ifdef DEBUG
+	{
+		u32 sstatus, serror, scontrol;
+
+		mv_scr_read(ap, SCR_STATUS, &sstatus);
+		mv_scr_read(ap, SCR_ERROR, &serror);
+		mv_scr_read(ap, SCR_CONTROL, &scontrol);
+		DPRINTK("S-regs after PHY wake: SStat 0x%08x SErr 0x%08x "
+			"SCtrl 0x%08x\n", sstatus, serror, scontrol);
+	}
+#endif
 
 	if (ata_port_offline(ap)) {
 		*class = ATA_DEV_NONE;
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 5d943da042f7..0b58c4df6fd2 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -236,8 +236,8 @@ static void nv_ck804_host_stop(struct ata_host *host);
 static irqreturn_t nv_generic_interrupt(int irq, void *dev_instance);
 static irqreturn_t nv_nf2_interrupt(int irq, void *dev_instance);
 static irqreturn_t nv_ck804_interrupt(int irq, void *dev_instance);
-static u32 nv_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void nv_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int nv_scr_read (struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int nv_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
 
 static void nv_nf2_freeze(struct ata_port *ap);
 static void nv_nf2_thaw(struct ata_port *ap);
@@ -1393,20 +1393,22 @@ static irqreturn_t nv_ck804_interrupt(int irq, void *dev_instance)
 	return ret;
 }
 
-static u32 nv_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int nv_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return 0xffffffffU;
+		return -EINVAL;
 
-	return ioread32(ap->ioaddr.scr_addr + (sc_reg * 4));
+	*val = ioread32(ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
-static void nv_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
+static int nv_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return;
+		return -EINVAL;
 
 	iowrite32(val, ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 static void nv_nf2_freeze(struct ata_port *ap)
diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index d2fcb9a6bec2..d39ebc23c4a9 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -128,8 +128,8 @@ struct pdc_port_priv {
 	dma_addr_t		pkt_dma;
 };
 
-static u32 pdc_sata_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void pdc_sata_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int pdc_sata_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int pdc_sata_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val);
 static int pdc_ata_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
 static int pdc_common_port_start(struct ata_port *ap);
 static int pdc_sata_port_start(struct ata_port *ap);
@@ -427,19 +427,20 @@ static int pdc_sata_cable_detect(struct ata_port *ap)
 	return ATA_CBL_SATA;
 }
 
-static u32 pdc_sata_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int pdc_sata_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return 0xffffffffU;
-	return readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+		return -EINVAL;
+	*val = readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
-static void pdc_sata_scr_write (struct ata_port *ap, unsigned int sc_reg,
-			       u32 val)
+static int pdc_sata_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return;
+		return -EINVAL;
 	writel(val, ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 static void pdc_atapi_pkt(struct ata_queued_cmd *qc)
@@ -642,8 +643,12 @@ static void pdc_error_intr(struct ata_port *ap, struct ata_queued_cmd *qc,
 			   | PDC_PCI_SYS_ERR | PDC1_PCI_PARITY_ERR))
 		ac_err_mask |= AC_ERR_HOST_BUS;
 
-	if (sata_scr_valid(ap))
-		ehi->serror |= pdc_sata_scr_read(ap, SCR_ERROR);
+	if (sata_scr_valid(ap)) {
+		u32 serror;
+
+		pdc_sata_scr_read(ap, SCR_ERROR, &serror);
+		ehi->serror |= serror;
+	}
 
 	qc->err_mask |= ac_err_mask;
 
diff --git a/drivers/ata/sata_qstor.c b/drivers/ata/sata_qstor.c
index 5aef4ac37012..c8f9242e7f44 100644
--- a/drivers/ata/sata_qstor.c
+++ b/drivers/ata/sata_qstor.c
@@ -111,8 +111,8 @@ struct qs_port_priv {
 	qs_state_t		state;
 };
 
-static u32 qs_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void qs_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int qs_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int qs_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val);
 static int qs_ata_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
 static int qs_port_start(struct ata_port *ap);
 static void qs_host_stop(struct ata_host *host);
@@ -255,18 +255,20 @@ static void qs_eng_timeout(struct ata_port *ap)
 	ata_eng_timeout(ap);
 }
 
-static u32 qs_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int qs_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return ~0U;
-	return readl(ap->ioaddr.scr_addr + (sc_reg * 8));
+		return -EINVAL;
+	*val = readl(ap->ioaddr.scr_addr + (sc_reg * 8));
+	return 0;
 }
 
-static void qs_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
+static int qs_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return;
+		return -EINVAL;
 	writel(val, ap->ioaddr.scr_addr + (sc_reg * 8));
+	return 0;
 }
 
 static unsigned int qs_fill_sg(struct ata_queued_cmd *qc)
diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index 2a86dc4598d0..db6763758952 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -115,8 +115,8 @@ static int sil_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
 static int sil_pci_device_resume(struct pci_dev *pdev);
 #endif
 static void sil_dev_config(struct ata_device *dev);
-static u32 sil_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void sil_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int sil_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int sil_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val);
 static int sil_set_mode (struct ata_port *ap, struct ata_device **r_failed);
 static void sil_freeze(struct ata_port *ap);
 static void sil_thaw(struct ata_port *ap);
@@ -350,19 +350,26 @@ static inline void __iomem *sil_scr_addr(struct ata_port *ap, unsigned int sc_re
 	return NULL;
 }
 
-static u32 sil_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int sil_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	void __iomem *mmio = sil_scr_addr(ap, sc_reg);
-	if (mmio)
-		return readl(mmio);
-	return 0xffffffffU;
+
+	if (mmio) {
+		*val = readl(mmio);
+		return 0;
+	}
+	return -EINVAL;
 }
 
-static void sil_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
+static int sil_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	void __iomem *mmio = sil_scr_addr(ap, sc_reg);
-	if (mmio)
+
+	if (mmio) {
 		writel(val, mmio);
+		return 0;
+	}
+	return -EINVAL;
 }
 
 static void sil_host_intr(struct ata_port *ap, u32 bmdma2)
@@ -378,7 +385,7 @@ static void sil_host_intr(struct ata_port *ap, u32 bmdma2)
 		 * controllers continue to assert IRQ as long as
 		 * SError bits are pending.  Clear SError immediately.
 		 */
-		serror = sil_scr_read(ap, SCR_ERROR);
+		sil_scr_read(ap, SCR_ERROR, &serror);
 		sil_scr_write(ap, SCR_ERROR, serror);
 
 		/* Trigger hotplug and accumulate SError only if the
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index e201f1cab66d..46fbbe7f121c 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -326,8 +326,8 @@ struct sil24_port_priv {
 
 static void sil24_dev_config(struct ata_device *dev);
 static u8 sil24_check_status(struct ata_port *ap);
-static u32 sil24_scr_read(struct ata_port *ap, unsigned sc_reg);
-static void sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val);
+static int sil24_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val);
+static int sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val);
 static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
 static void sil24_qc_prep(struct ata_queued_cmd *qc);
 static unsigned int sil24_qc_issue(struct ata_queued_cmd *qc);
@@ -488,25 +488,30 @@ static int sil24_scr_map[] = {
 	[SCR_ACTIVE]	= 3,
 };
 
-static u32 sil24_scr_read(struct ata_port *ap, unsigned sc_reg)
+static int sil24_scr_read(struct ata_port *ap, unsigned sc_reg, u32 *val)
 {
 	void __iomem *scr_addr = ap->ioaddr.scr_addr;
+
 	if (sc_reg < ARRAY_SIZE(sil24_scr_map)) {
 		void __iomem *addr;
 		addr = scr_addr + sil24_scr_map[sc_reg] * 4;
-		return readl(scr_addr + sil24_scr_map[sc_reg] * 4);
+		*val = readl(scr_addr + sil24_scr_map[sc_reg] * 4);
+		return 0;
 	}
-	return 0xffffffffU;
+	return -EINVAL;
 }
 
-static void sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val)
+static int sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val)
 {
 	void __iomem *scr_addr = ap->ioaddr.scr_addr;
+
 	if (sc_reg < ARRAY_SIZE(sil24_scr_map)) {
 		void __iomem *addr;
 		addr = scr_addr + sil24_scr_map[sc_reg] * 4;
 		writel(val, scr_addr + sil24_scr_map[sc_reg] * 4);
+		return 0;
 	}
+	return -EINVAL;
 }
 
 static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
diff --git a/drivers/ata/sata_sis.c b/drivers/ata/sata_sis.c
index 33716b00c6b7..31a2f55aae66 100644
--- a/drivers/ata/sata_sis.c
+++ b/drivers/ata/sata_sis.c
@@ -64,8 +64,8 @@ enum {
 };
 
 static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
-static u32 sis_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void sis_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int sis_scr_read (struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int sis_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
 
 static const struct pci_device_id sis_pci_tbl[] = {
 	{ PCI_VDEVICE(SI, 0x0180), sis_180 },		/* SiS 964/180 */
@@ -207,36 +207,37 @@ static void sis_scr_cfg_write (struct ata_port *ap, unsigned int sc_reg, u32 val
 		pci_write_config_dword(pdev, cfg_addr+0x10, val);
 }
 
-static u32 sis_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int sis_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
-	u32 val, val2 = 0;
 	u8 pmr;
 
 	if (sc_reg > SCR_CONTROL)
-		return 0xffffffffU;
+		return -EINVAL;
 
 	if (ap->flags & SIS_FLAG_CFGSCR)
 		return sis_scr_cfg_read(ap, sc_reg);
 
 	pci_read_config_byte(pdev, SIS_PMR, &pmr);
 
-	val = ioread32(ap->ioaddr.scr_addr + (sc_reg * 4));
+	*val = ioread32(ap->ioaddr.scr_addr + (sc_reg * 4));
 
 	if ((pdev->device == 0x0182) || (pdev->device == 0x0183) ||
 	    (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
-		val2 = ioread32(ap->ioaddr.scr_addr + (sc_reg * 4) + 0x10);
+		*val |= ioread32(ap->ioaddr.scr_addr + (sc_reg * 4) + 0x10);
+
+	*val &= 0xfffffffb;
 
-	return (val | val2) &  0xfffffffb;
+	return 0;
 }
 
-static void sis_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
+static int sis_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
 	u8 pmr;
 
 	if (sc_reg > SCR_CONTROL)
-		return;
+		return -EINVAL;
 
 	pci_read_config_byte(pdev, SIS_PMR, &pmr);
 
@@ -248,6 +249,7 @@ static void sis_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
 		    (pdev->device == 0x1182) || (pmr & SIS_PMR_COMBINED))
 			iowrite32(val, ap->ioaddr.scr_addr + (sc_reg * 4)+0x10);
 	}
+	return 0;
 }
 
 static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
diff --git a/drivers/ata/sata_svw.c b/drivers/ata/sata_svw.c
index 63fe99afd59f..92e877075037 100644
--- a/drivers/ata/sata_svw.c
+++ b/drivers/ata/sata_svw.c
@@ -103,20 +103,21 @@ static int k2_sata_check_atapi_dma(struct ata_queued_cmd *qc)
 	return 0;
 }
 
-static u32 k2_sata_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int k2_sata_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return 0xffffffffU;
-	return readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+		return -EINVAL;
+	*val = readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 
-static void k2_sata_scr_write (struct ata_port *ap, unsigned int sc_reg,
-			       u32 val)
+static int k2_sata_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return;
+		return -EINVAL;
 	writel(val, ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 
diff --git a/drivers/ata/sata_uli.c b/drivers/ata/sata_uli.c
index b52f83ab056a..78c28512f01c 100644
--- a/drivers/ata/sata_uli.c
+++ b/drivers/ata/sata_uli.c
@@ -57,8 +57,8 @@ struct uli_priv {
 };
 
 static int uli_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
-static u32 uli_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void uli_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int uli_scr_read (struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int uli_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
 
 static const struct pci_device_id uli_pci_tbl[] = {
 	{ PCI_VDEVICE(AL, 0x5289), uli_5289 },
@@ -164,20 +164,22 @@ static void uli_scr_cfg_write (struct ata_port *ap, unsigned int scr, u32 val)
 	pci_write_config_dword(pdev, cfg_addr, val);
 }
 
-static u32 uli_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int uli_scr_read (struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return 0xffffffffU;
+		return -EINVAL;
 
-	return uli_scr_cfg_read(ap, sc_reg);
+	*val = uli_scr_cfg_read(ap, sc_reg);
+	return 0;
 }
 
-static void uli_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
+static int uli_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	if (sc_reg > SCR_CONTROL)	//SCR_CONTROL=2, SCR_ERROR=1, SCR_STATUS=0
-		return;
+		return -EINVAL;
 
 	uli_scr_cfg_write(ap, sc_reg, val);
+	return 0;
 }
 
 static int uli_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
diff --git a/drivers/ata/sata_via.c b/drivers/ata/sata_via.c
index c4124475f754..86b7bfc17324 100644
--- a/drivers/ata/sata_via.c
+++ b/drivers/ata/sata_via.c
@@ -72,8 +72,8 @@ enum {
 };
 
 static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
-static u32 svia_scr_read (struct ata_port *ap, unsigned int sc_reg);
-static void svia_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
+static int svia_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val);
+static int svia_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val);
 static void svia_noop_freeze(struct ata_port *ap);
 static void vt6420_error_handler(struct ata_port *ap);
 static int vt6421_pata_cable_detect(struct ata_port *ap);
@@ -249,18 +249,20 @@ MODULE_LICENSE("GPL");
 MODULE_DEVICE_TABLE(pci, svia_pci_tbl);
 MODULE_VERSION(DRV_VERSION);
 
-static u32 svia_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int svia_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return 0xffffffffU;
-	return ioread32(ap->ioaddr.scr_addr + (4 * sc_reg));
+		return -EINVAL;
+	*val = ioread32(ap->ioaddr.scr_addr + (4 * sc_reg));
+	return 0;
 }
 
-static void svia_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val)
+static int svia_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return;
+		return -EINVAL;
 	iowrite32(val, ap->ioaddr.scr_addr + (4 * sc_reg));
+	return 0;
 }
 
 static void svia_noop_freeze(struct ata_port *ap)
@@ -305,18 +307,19 @@ static int vt6420_prereset(struct ata_port *ap, unsigned long deadline)
 
 	/* Resume phy.  This is the old SATA resume sequence */
 	svia_scr_write(ap, SCR_CONTROL, 0x300);
-	svia_scr_read(ap, SCR_CONTROL); /* flush */
+	svia_scr_read(ap, SCR_CONTROL, &scontrol); /* flush */
 
 	/* wait for phy to become ready, if necessary */
 	do {
 		msleep(200);
-		if ((svia_scr_read(ap, SCR_STATUS) & 0xf) != 1)
+		svia_scr_read(ap, SCR_STATUS, &sstatus);
+		if ((sstatus & 0xf) != 1)
 			break;
 	} while (time_before(jiffies, timeout));
 
 	/* open code sata_print_link_status() */
-	sstatus = svia_scr_read(ap, SCR_STATUS);
-	scontrol = svia_scr_read(ap, SCR_CONTROL);
+	svia_scr_read(ap, SCR_STATUS, &sstatus);
+	svia_scr_read(ap, SCR_CONTROL, &scontrol);
 
 	online = (sstatus & 0xf) == 0x3;
 
@@ -325,7 +328,7 @@ static int vt6420_prereset(struct ata_port *ap, unsigned long deadline)
 			online ? "up" : "down", sstatus, scontrol);
 
 	/* SStatus is read one more time */
-	svia_scr_read(ap, SCR_STATUS);
+	svia_scr_read(ap, SCR_STATUS, &sstatus);
 
 	if (!online) {
 		/* tell EH to bail */
diff --git a/drivers/ata/sata_vsc.c b/drivers/ata/sata_vsc.c
index 1b5d81faa102..24344d0d0575 100644
--- a/drivers/ata/sata_vsc.c
+++ b/drivers/ata/sata_vsc.c
@@ -98,20 +98,21 @@ enum {
 			      VSC_SATA_INT_PHY_CHANGE),
 };
 
-static u32 vsc_sata_scr_read (struct ata_port *ap, unsigned int sc_reg)
+static int vsc_sata_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return 0xffffffffU;
-	return readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+		return -EINVAL;
+	*val = readl(ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 
-static void vsc_sata_scr_write (struct ata_port *ap, unsigned int sc_reg,
-			       u32 val)
+static int vsc_sata_scr_write(struct ata_port *ap, unsigned int sc_reg, u32 val)
 {
 	if (sc_reg > SCR_CONTROL)
-		return;
+		return -EINVAL;
 	writel(val, ap->ioaddr.scr_addr + (sc_reg * 4));
+	return 0;
 }
 
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index cb181713d9b5..c732b3e78e28 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -620,9 +620,8 @@ struct ata_port_operations {
 	u8 (*irq_on) (struct ata_port *);
 	u8 (*irq_ack) (struct ata_port *ap, unsigned int chk_drq);
 
-	u32 (*scr_read) (struct ata_port *ap, unsigned int sc_reg);
-	void (*scr_write) (struct ata_port *ap, unsigned int sc_reg,
-			   u32 val);
+	int (*scr_read) (struct ata_port *ap, unsigned int sc_reg, u32 *val);
+	int (*scr_write) (struct ata_port *ap, unsigned int sc_reg, u32 val);
 
 	int (*port_suspend) (struct ata_port *ap, pm_message_t mesg);
 	int (*port_resume) (struct ata_port *ap);
-- 
cgit v1.3-8-gc7d7


From 008a78961ec72990d09d7625ef9499d7317d040d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 16 Jul 2007 14:29:40 +0900
Subject: libata: improve SATA PHY speed down logic

sata_down_spd_limit() first reads the current SPD from SStatus and
limit the speed to the lower one of one below the current limit or one
below the current SPD in SStatus.  SPD may not be accessible or valid
when SPD down is requested making sata_down_spd_limit() fail when it's
most needed.

This patch makes the current SPD cached after each successful reset
and forces GEN I speed (1.5Gbps) if neither of SStatus or the cached
value is valid, so sata_down_spd_limit() is now guaranteed to lower
the speed limit if lower speed is available.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-core.c | 29 ++++++++++++++++++++++-------
 drivers/ata/libata-eh.c   |  6 ++++++
 include/linux/libata.h    |  1 +
 3 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 5718c247e23a..c325b7a4246a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2389,21 +2389,35 @@ int sata_down_spd_limit(struct ata_port *ap)
 	u32 sstatus, spd, mask;
 	int rc, highbit;
 
+	if (!sata_scr_valid(ap))
+		return -EOPNOTSUPP;
+
+	/* If SCR can be read, use it to determine the current SPD.
+	 * If not, use cached value in ap->sata_spd.
+	 */
 	rc = sata_scr_read(ap, SCR_STATUS, &sstatus);
-	if (rc)
-		return rc;
+	if (rc == 0)
+		spd = (sstatus >> 4) & 0xf;
+	else
+		spd = ap->sata_spd;
 
 	mask = ap->sata_spd_limit;
 	if (mask <= 1)
 		return -EINVAL;
+
+	/* unconditionally mask off the highest bit */
 	highbit = fls(mask) - 1;
 	mask &= ~(1 << highbit);
 
-	spd = (sstatus >> 4) & 0xf;
-	if (spd <= 1)
-		return -EINVAL;
-	spd--;
-	mask &= (1 << spd) - 1;
+	/* Mask off all speeds higher than or equal to the current
+	 * one.  Force 1.5Gbps if current SPD is not available.
+	 */
+	if (spd > 1)
+		mask &= (1 << (spd - 1)) - 1;
+	else
+		mask &= 1;
+
+	/* were we already at the bottom? */
 	if (!mask)
 		return -EINVAL;
 
@@ -5995,6 +6009,7 @@ void ata_dev_init(struct ata_device *dev)
 
 	/* SATA spd limit is bound to the first device */
 	ap->sata_spd_limit = ap->hw_sata_spd_limit;
+	ap->sata_spd = 0;
 
 	/* High bits of dev->flags are used to record warm plug
 	 * requests which occur asynchronously.  Synchronize using
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 19f9947bd96b..183eaf466d4f 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1799,12 +1799,18 @@ static int ata_eh_reset(struct ata_port *ap, int classify,
 	}
 
 	if (rc == 0) {
+		u32 sstatus;
+
 		/* After the reset, the device state is PIO 0 and the
 		 * controller state is undefined.  Record the mode.
 		 */
 		for (i = 0; i < ATA_MAX_DEVICES; i++)
 			ap->device[i].pio_mode = XFER_PIO_0;
 
+		/* record current link speed */
+		if (sata_scr_read(ap, SCR_STATUS, &sstatus) == 0)
+			ap->sata_spd = (sstatus >> 4) & 0xf;
+
 		if (postreset)
 			postreset(ap, classes);
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c732b3e78e28..16ebdf152c75 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -531,6 +531,7 @@ struct ata_port {
 	unsigned int		cbl;	/* cable type; ATA_CBL_xxx */
 	unsigned int		hw_sata_spd_limit;
 	unsigned int		sata_spd_limit;	/* SATA PHY speed limit */
+	unsigned int		sata_spd;	/* current SATA PHY speed */
 
 	/* record runtime error info, protected by host lock */
 	struct ata_eh_info	eh_info;
-- 
cgit v1.3-8-gc7d7


From f8f1e1cc0cd4d75c73e9a55a0ede8958e4fa14f1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 16 Jul 2007 14:29:40 +0900
Subject: libata: reorganize ata_ehi_hotplugged()

__ata_ehi_hotplugged() now has no users.  Regorganize
ata_ehi_hotplugged() such that a new function ata_ehi_schedule_probe()
deals with scheduling probing.  ata_ehi_hotplugged() calls it and
additionally marks hotplug specific flags.  ata_ehi_schedule_probe()
will be used laster.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/libata.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 16ebdf152c75..74800ad6d81f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -915,16 +915,17 @@ extern void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...);
 extern void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...);
 extern void ata_ehi_clear_desc(struct ata_eh_info *ehi);
 
-static inline void __ata_ehi_hotplugged(struct ata_eh_info *ehi)
+static inline void ata_ehi_schedule_probe(struct ata_eh_info *ehi)
 {
-	ehi->flags |= ATA_EHI_HOTPLUGGED | ATA_EHI_RESUME_LINK;
+	ehi->flags |= ATA_EHI_RESUME_LINK;
 	ehi->action |= ATA_EH_SOFTRESET;
 	ehi->probe_mask |= (1 << ATA_MAX_DEVICES) - 1;
 }
 
 static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi)
 {
-	__ata_ehi_hotplugged(ehi);
+	ata_ehi_schedule_probe(ehi);
+	ehi->flags |= ATA_EHI_HOTPLUGGED;
 	ehi->err_mask |= AC_ERR_ATA_BUS;
 }
 
-- 
cgit v1.3-8-gc7d7


From 5ddf24c5ea9d715dc4f5d5d5dd1c9337d90466dc Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 16 Jul 2007 14:29:41 +0900
Subject: libata: implement EH fast drain

In most cases, when EH is scheduled, all in-flight commands are
aborted causing EH to kick in immediately.  However, in some cases
(especially with PMP), it's unclear which commands are affected by the
error condition and although aborting all in-flight commands work, it
isn't optimal and may cause unnecessary disruption.  On the other
hand, waiting for in-flight commands to drain themselves can take up
to 30seconds.

This patch implements EH fast drain to handle such situations.  It
gives in-flight commands some time to finish up but doesn't wait for
too long.  After EH is scheduled, fast drain timer is started and if
no other completion occurs in ATA_EH_FASTDRAIN_INTERVAL all in-flight
commands are aborted.  If any completion occurred in the interval, the
port is given another interval to finish up itself.

Currently ATA_EH_FASTDRAIN_INTERVAL is 3 secs which should be enough
for finishing up most commands.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-core.c |  3 ++
 drivers/ata/libata-eh.c   | 99 ++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/ata/libata.h      |  1 +
 include/linux/libata.h    |  3 ++
 4 files changed, 104 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 35b621293831..6001aae0b884 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6077,6 +6077,9 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
 	INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan);
 	INIT_LIST_HEAD(&ap->eh_done_q);
 	init_waitqueue_head(&ap->eh_wait_q);
+	init_timer_deferrable(&ap->fastdrain_timer);
+	ap->fastdrain_timer.function = ata_eh_fastdrain_timerfn;
+	ap->fastdrain_timer.data = (unsigned long)ap;
 
 	ap->cbl = ATA_CBL_NONE;
 
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index e7e2ba24ce66..ac6ceed4bb60 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -56,6 +56,7 @@ enum {
  */
 enum {
 	ATA_EH_PRERESET_TIMEOUT		= 10 * HZ,
+	ATA_EH_FASTDRAIN_INTERVAL	= 3 * HZ,
 };
 
 /* The following table determines how we sequence resets.  Each entry
@@ -361,6 +362,9 @@ void ata_scsi_error(struct Scsi_Host *host)
  repeat:
 	/* invoke error handler */
 	if (ap->ops->error_handler) {
+		/* kill fast drain timer */
+		del_timer_sync(&ap->fastdrain_timer);
+
 		/* process port resume request */
 		ata_eh_handle_port_resume(ap);
 
@@ -576,6 +580,94 @@ void ata_eng_timeout(struct ata_port *ap)
 	DPRINTK("EXIT\n");
 }
 
+static int ata_eh_nr_in_flight(struct ata_port *ap)
+{
+	unsigned int tag;
+	int nr = 0;
+
+	/* count only non-internal commands */
+	for (tag = 0; tag < ATA_MAX_QUEUE - 1; tag++)
+		if (ata_qc_from_tag(ap, tag))
+			nr++;
+
+	return nr;
+}
+
+void ata_eh_fastdrain_timerfn(unsigned long arg)
+{
+	struct ata_port *ap = (void *)arg;
+	unsigned long flags;
+	int cnt;
+
+	spin_lock_irqsave(ap->lock, flags);
+
+	cnt = ata_eh_nr_in_flight(ap);
+
+	/* are we done? */
+	if (!cnt)
+		goto out_unlock;
+
+	if (cnt == ap->fastdrain_cnt) {
+		unsigned int tag;
+
+		/* No progress during the last interval, tag all
+		 * in-flight qcs as timed out and freeze the port.
+		 */
+		for (tag = 0; tag < ATA_MAX_QUEUE - 1; tag++) {
+			struct ata_queued_cmd *qc = ata_qc_from_tag(ap, tag);
+			if (qc)
+				qc->err_mask |= AC_ERR_TIMEOUT;
+		}
+
+		ata_port_freeze(ap);
+	} else {
+		/* some qcs have finished, give it another chance */
+		ap->fastdrain_cnt = cnt;
+		ap->fastdrain_timer.expires =
+			jiffies + ATA_EH_FASTDRAIN_INTERVAL;
+		add_timer(&ap->fastdrain_timer);
+	}
+
+ out_unlock:
+	spin_unlock_irqrestore(ap->lock, flags);
+}
+
+/**
+ *	ata_eh_set_pending - set ATA_PFLAG_EH_PENDING and activate fast drain
+ *	@ap: target ATA port
+ *	@fastdrain: activate fast drain
+ *
+ *	Set ATA_PFLAG_EH_PENDING and activate fast drain if @fastdrain
+ *	is non-zero and EH wasn't pending before.  Fast drain ensures
+ *	that EH kicks in in timely manner.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host lock)
+ */
+static void ata_eh_set_pending(struct ata_port *ap, int fastdrain)
+{
+	int cnt;
+
+	/* already scheduled? */
+	if (ap->pflags & ATA_PFLAG_EH_PENDING)
+		return;
+
+	ap->pflags |= ATA_PFLAG_EH_PENDING;
+
+	if (!fastdrain)
+		return;
+
+	/* do we have in-flight qcs? */
+	cnt = ata_eh_nr_in_flight(ap);
+	if (!cnt)
+		return;
+
+	/* activate fast drain */
+	ap->fastdrain_cnt = cnt;
+	ap->fastdrain_timer.expires = jiffies + ATA_EH_FASTDRAIN_INTERVAL;
+	add_timer(&ap->fastdrain_timer);
+}
+
 /**
  *	ata_qc_schedule_eh - schedule qc for error handling
  *	@qc: command to schedule error handling for
@@ -593,7 +685,7 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
 	WARN_ON(!ap->ops->error_handler);
 
 	qc->flags |= ATA_QCFLAG_FAILED;
-	qc->ap->pflags |= ATA_PFLAG_EH_PENDING;
+	ata_eh_set_pending(ap, 1);
 
 	/* The following will fail if timeout has already expired.
 	 * ata_scsi_error() takes care of such scmds on EH entry.
@@ -620,7 +712,7 @@ void ata_port_schedule_eh(struct ata_port *ap)
 	if (ap->pflags & ATA_PFLAG_INITIALIZING)
 		return;
 
-	ap->pflags |= ATA_PFLAG_EH_PENDING;
+	ata_eh_set_pending(ap, 1);
 	scsi_schedule_eh(ap->scsi_host);
 
 	DPRINTK("port EH scheduled\n");
@@ -644,6 +736,9 @@ int ata_port_abort(struct ata_port *ap)
 
 	WARN_ON(!ap->ops->error_handler);
 
+	/* we're gonna abort all commands, no need for fast drain */
+	ata_eh_set_pending(ap, 0);
+
 	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
 		struct ata_queued_cmd *qc = ata_qc_from_tag(ap, tag);
 
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 48836b22ce2f..564cd234c805 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -151,6 +151,7 @@ extern int ata_bus_probe(struct ata_port *ap);
 extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern void ata_scsi_error(struct Scsi_Host *host);
 extern void ata_port_wait_eh(struct ata_port *ap);
+extern void ata_eh_fastdrain_timerfn(unsigned long arg);
 extern void ata_qc_schedule_eh(struct ata_queued_cmd *qc);
 
 /* libata-sff.c */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 74800ad6d81f..be5a43928c84 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -565,6 +565,9 @@ struct ata_port {
 	pm_message_t		pm_mesg;
 	int			*pm_result;
 
+	struct timer_list	fastdrain_timer;
+	unsigned long		fastdrain_cnt;
+
 	void			*private_data;
 
 #ifdef CONFIG_ATA_ACPI
-- 
cgit v1.3-8-gc7d7


From d046943cbaf332f75284ad99f4b3e60bae7ffff2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Fri, 20 Jul 2007 16:18:06 +0100
Subject: fix gfp_t annotations for slub

	Since we have use like ~SLUB_DMA, we ought to have the type
set right in both cases.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 07f7e4cbcee3..124270df8734 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -160,7 +160,7 @@ static inline struct kmem_cache *kmalloc_slab(size_t size)
 #define SLUB_DMA __GFP_DMA
 #else
 /* Disable DMA functionality */
-#define SLUB_DMA 0
+#define SLUB_DMA (__force gfp_t)0
 #endif
 
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
-- 
cgit v1.3-8-gc7d7


From eb0645a8b1f14da300f40bb9f424640cd1181fbf Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 20 Jul 2007 00:31:46 -0700
Subject: async_tx: fix kmap_atomic usage in async_memcpy

Andrew Morton:
	[async_memcpy] is very wrong if both ASYNC_TX_KMAP_DST and
	ASYNC_TX_KMAP_SRC can ever be set.  We'll end up using the same kmap
	slot for both src add dest and we get either corrupted data or a BUG.

Evgeniy Polyakov:
	Btw, shouldn't it always be kmap_atomic() even if flag is not set.
	That pages are usual one returned by alloc_page().

So fix the usage of kmap_atomic and kill the ASYNC_TX_KMAP_DST and
ASYNC_TX_KMAP_SRC flags.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 crypto/async_tx/async_memcpy.c | 19 ++++---------------
 drivers/md/raid5.c             |  4 ++--
 include/linux/async_tx.h       |  6 ------
 3 files changed, 6 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c
index a973f4ef897d..047e533fcc5b 100644
--- a/crypto/async_tx/async_memcpy.c
+++ b/crypto/async_tx/async_memcpy.c
@@ -36,7 +36,6 @@
  * @offset: offset in pages to start transaction
  * @len: length in bytes
  * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK,
- *	ASYNC_TX_KMAP_SRC, ASYNC_TX_KMAP_DST
  * @depend_tx: memcpy depends on the result of this transaction
  * @cb_fn: function to call when the memcpy completes
  * @cb_param: parameter to pass to the callback routine
@@ -88,23 +87,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 					__FUNCTION__);
 		}
 
-		if (flags & ASYNC_TX_KMAP_DST)
-			dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
-		else
-			dest_buf = page_address(dest) + dest_offset;
-
-		if (flags & ASYNC_TX_KMAP_SRC)
-			src_buf = kmap_atomic(src, KM_USER0) + src_offset;
-		else
-			src_buf = page_address(src) + src_offset;
+		dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset;
+		src_buf = kmap_atomic(src, KM_USER1) + src_offset;
 
 		memcpy(dest_buf, src_buf, len);
 
-		if (flags & ASYNC_TX_KMAP_DST)
-			kunmap_atomic(dest_buf, KM_USER0);
-
-		if (flags & ASYNC_TX_KMAP_SRC)
-			kunmap_atomic(src_buf, KM_USER0);
+		kunmap_atomic(dest_buf, KM_USER0);
+		kunmap_atomic(src_buf, KM_USER1);
 
 		async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param);
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c8dfdb302916..d90ee145effe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -493,12 +493,12 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 			if (frombio)
 				tx = async_memcpy(page, bio_page, page_offset,
 					b_offset, clen,
-					ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC,
+					ASYNC_TX_DEP_ACK,
 					tx, NULL, NULL);
 			else
 				tx = async_memcpy(bio_page, page, b_offset,
 					page_offset, clen,
-					ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST,
+					ASYNC_TX_DEP_ACK,
 					tx, NULL, NULL);
 		}
 		if (clen < len) /* hit end of page */
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index ff1255079fa1..bdca3f1b3213 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -51,10 +51,6 @@ struct dma_chan_ref {
  * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
  * dependency chain
  * @ASYNC_TX_DEP_ACK: ack the dependency descriptor.  Useful for chaining.
- * @ASYNC_TX_KMAP_SRC: if the transaction is to be performed synchronously
- * take an atomic mapping (KM_USER0) on the source page(s)
- * @ASYNC_TX_KMAP_DST: if the transaction is to be performed synchronously
- * take an atomic mapping (KM_USER0) on the dest page(s)
  */
 enum async_tx_flags {
 	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
@@ -62,8 +58,6 @@ enum async_tx_flags {
 	ASYNC_TX_ASSUME_COHERENT = (1 << 2),
 	ASYNC_TX_ACK		 = (1 << 3),
 	ASYNC_TX_DEP_ACK	 = (1 << 4),
-	ASYNC_TX_KMAP_SRC	 = (1 << 5),
-	ASYNC_TX_KMAP_DST	 = (1 << 6),
 };
 
 #ifdef CONFIG_DMA_ENGINE
-- 
cgit v1.3-8-gc7d7


From 0aa366f351d044703e25c8425e508170e80d83b1 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Fri, 20 Jul 2007 11:22:30 -0700
Subject: [IA64] Convert to generic timekeeping/clocksource

This is a merge of Peter Keilty's initial patch (which was
revived by Bob Picco) for this with Hidetoshi Seto's fixes
and scaling improvements.

Acked-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/kernel-parameters.txt   |   2 +
 arch/ia64/Kconfig                     |   6 +-
 arch/ia64/configs/bigsur_defconfig    |   2 +-
 arch/ia64/configs/gensparse_defconfig |   2 +-
 arch/ia64/configs/sim_defconfig       |   2 +-
 arch/ia64/configs/sn2_defconfig       |   2 +-
 arch/ia64/configs/tiger_defconfig     |   2 +-
 arch/ia64/configs/zx1_defconfig       |   2 +-
 arch/ia64/defconfig                   |   2 +-
 arch/ia64/kernel/asm-offsets.c        |  35 ++++---
 arch/ia64/kernel/cyclone.c            |  46 ++++++---
 arch/ia64/kernel/fsys.S               | 179 +++++++++++++++++-----------------
 arch/ia64/kernel/fsyscall_gtod_data.h |  23 +++++
 arch/ia64/kernel/time.c               |  96 ++++++++++++++++--
 arch/ia64/sn/kernel/sn2/timer.c       |  29 ++++--
 drivers/acpi/processor_idle.c         |   4 +-
 drivers/char/hpet.c                   |  70 +++++++------
 include/linux/clocksource.h           |   6 ++
 18 files changed, 335 insertions(+), 175 deletions(-)
 create mode 100644 arch/ia64/kernel/fsyscall_gtod_data.h

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9a541486fb7e..68115d7b0a7a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1154,6 +1154,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nointroute	[IA-64]
 
+	nojitter	[IA64] Disables jitter checking for ITC timers.
+
 	nolapic		[IA-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[IA-32,APIC] Do not use the local APIC timer.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 616c96e73483..36c7b9682aa6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -62,7 +62,11 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
-config TIME_INTERPOLATION
+config GENERIC_TIME
+	bool
+	default y
+
+config GENERIC_TIME_VSYSCALL
 	bool
 	default y
 
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
index 90e9c2e61bf4..9eb48c0927b0 100644
--- a/arch/ia64/configs/bigsur_defconfig
+++ b/arch/ia64/configs/bigsur_defconfig
@@ -85,7 +85,7 @@ CONFIG_MMU=y
 CONFIG_SWIOTLB=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_EFI=y
 CONFIG_GENERIC_IOMAP=y
 CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
index 0d29aa2066b3..3a9ed951db08 100644
--- a/arch/ia64/configs/gensparse_defconfig
+++ b/arch/ia64/configs/gensparse_defconfig
@@ -86,7 +86,7 @@ CONFIG_MMU=y
 CONFIG_SWIOTLB=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_EFI=y
 CONFIG_GENERIC_IOMAP=y
 CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
diff --git a/arch/ia64/configs/sim_defconfig b/arch/ia64/configs/sim_defconfig
index d9146c31ea13..c420d9f3df98 100644
--- a/arch/ia64/configs/sim_defconfig
+++ b/arch/ia64/configs/sim_defconfig
@@ -86,7 +86,7 @@ CONFIG_MMU=y
 CONFIG_SWIOTLB=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_EFI=y
 CONFIG_GENERIC_IOMAP=y
 CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
diff --git a/arch/ia64/configs/sn2_defconfig b/arch/ia64/configs/sn2_defconfig
index 64e951de4e57..4c9ffc47bc7a 100644
--- a/arch/ia64/configs/sn2_defconfig
+++ b/arch/ia64/configs/sn2_defconfig
@@ -93,7 +93,7 @@ CONFIG_SWIOTLB=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_DMI=y
 CONFIG_EFI=y
 CONFIG_GENERIC_IOMAP=y
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index a1446931b401..e208747d5de4 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -98,7 +98,7 @@ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 # CONFIG_ARCH_HAS_ILOG2_U64 is not set
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_DMI=y
 CONFIG_EFI=y
 CONFIG_GENERIC_IOMAP=y
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
index 1c7955c16358..4a060fc39934 100644
--- a/arch/ia64/configs/zx1_defconfig
+++ b/arch/ia64/configs/zx1_defconfig
@@ -96,7 +96,7 @@ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 # CONFIG_ARCH_HAS_ILOG2_U64 is not set
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_DMI=y
 CONFIG_EFI=y
 CONFIG_GENERIC_IOMAP=y
diff --git a/arch/ia64/defconfig b/arch/ia64/defconfig
index 90bd9601cdde..461f8ee738fb 100644
--- a/arch/ia64/defconfig
+++ b/arch/ia64/defconfig
@@ -98,7 +98,7 @@ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 # CONFIG_ARCH_HAS_ILOG2_U64 is not set
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_TIME_INTERPOLATION=y
+CONFIG_GENERIC_TIME=y
 CONFIG_DMI=y
 CONFIG_EFI=y
 CONFIG_GENERIC_IOMAP=y
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 2236fabbb3c6..0aebc6f79e95 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -7,6 +7,7 @@
 #define ASM_OFFSETS_C 1
 
 #include <linux/sched.h>
+#include <linux/clocksource.h>
 
 #include <asm-ia64/processor.h>
 #include <asm-ia64/ptrace.h>
@@ -15,6 +16,7 @@
 #include <asm-ia64/mca.h>
 
 #include "../kernel/sigframe.h"
+#include "../kernel/fsyscall_gtod_data.h"
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -256,17 +258,24 @@ void foo(void)
 	BLANK();
 
 	/* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
-	DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr));
-	DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source));
-	DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift));
-	DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc));
-	DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset));
-	DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle));
-	DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter));
-	DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter));
-	DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask));
-	DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU);
-	DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
-	DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
-	DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
+	DEFINE(IA64_GTOD_LOCK_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, lock));
+	DEFINE(IA64_GTOD_WALL_TIME_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, wall_time));
+	DEFINE(IA64_GTOD_MONO_TIME_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, monotonic_time));
+	DEFINE(IA64_CLKSRC_MASK_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_mask));
+	DEFINE(IA64_CLKSRC_MULT_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_mult));
+	DEFINE(IA64_CLKSRC_SHIFT_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_shift));
+	DEFINE(IA64_CLKSRC_MMIO_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_fsys_mmio));
+	DEFINE(IA64_CLKSRC_CYCLE_LAST_OFFSET,
+		offsetof (struct fsyscall_gtod_data_t, clk_cycle_last));
+	DEFINE(IA64_ITC_JITTER_OFFSET,
+		offsetof (struct itc_jitter_data_t, itc_jitter));
+	DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
+		offsetof (struct itc_jitter_data_t, itc_lastcycle));
 }
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index e00b21514f7c..2fd96d9062a1 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -3,6 +3,7 @@
 #include <linux/time.h>
 #include <linux/errno.h>
 #include <linux/timex.h>
+#include <linux/clocksource.h>
 #include <asm/io.h>
 
 /* IBM Summit (EXA) Cyclone counter code*/
@@ -18,13 +19,21 @@ void __init cyclone_setup(void)
 	use_cyclone = 1;
 }
 
+static void __iomem *cyclone_mc;
 
-struct time_interpolator cyclone_interpolator = {
-	.source =	TIME_SOURCE_MMIO64,
-	.shift =	16,
-	.frequency =	CYCLONE_TIMER_FREQ,
-	.drift =	-100,
-	.mask =		(1LL << 40) - 1
+static cycle_t read_cyclone(void)
+{
+	return (cycle_t)readq((void __iomem *)cyclone_mc);
+}
+
+static struct clocksource clocksource_cyclone = {
+        .name           = "cyclone",
+        .rating         = 300,
+        .read           = read_cyclone,
+        .mask           = (1LL << 40) - 1,
+        .mult           = 0, /*to be caluclated*/
+        .shift          = 16,
+        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
 int __init init_cyclone_clock(void)
@@ -44,13 +53,15 @@ int __init init_cyclone_clock(void)
 	offset = (CYCLONE_CBAR_ADDR);
 	reg = (u64*)ioremap_nocache(offset, sizeof(u64));
 	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
 	base = readq(reg);
 	if(!base){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
+				" value.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
@@ -60,7 +71,8 @@ int __init init_cyclone_clock(void)
 	offset = (base + CYCLONE_PMCC_OFFSET);
 	reg = (u64*)ioremap_nocache(offset, sizeof(u64));
 	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid PMCC"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
@@ -71,7 +83,8 @@ int __init init_cyclone_clock(void)
 	offset = (base + CYCLONE_MPCS_OFFSET);
 	reg = (u64*)ioremap_nocache(offset, sizeof(u64));
 	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid MPCS"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
@@ -82,7 +95,8 @@ int __init init_cyclone_clock(void)
 	offset = (base + CYCLONE_MPMC_OFFSET);
 	cyclone_timer = (u32*)ioremap_nocache(offset, sizeof(u32));
 	if(!cyclone_timer){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
+		printk(KERN_ERR "Summit chipset: Could not find valid MPMC"
+				" register.\n");
 		use_cyclone = 0;
 		return -ENODEV;
 	}
@@ -93,7 +107,8 @@ int __init init_cyclone_clock(void)
 		int stall = 100;
 		while(stall--) barrier();
 		if(readl(cyclone_timer) == old){
-			printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
+			printk(KERN_ERR "Summit chipset: Counter not counting!"
+					" DISABLED\n");
 			iounmap(cyclone_timer);
 			cyclone_timer = 0;
 			use_cyclone = 0;
@@ -101,8 +116,11 @@ int __init init_cyclone_clock(void)
 		}
 	}
 	/* initialize last tick */
-	cyclone_interpolator.addr = cyclone_timer;
-	register_time_interpolator(&cyclone_interpolator);
+	cyclone_mc = cyclone_timer;
+	clocksource_cyclone.fsys_mmio = cyclone_timer;
+	clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ,
+						clocksource_cyclone.shift);
+	clocksource_register(&clocksource_cyclone);
 
 	return 0;
 }
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 3f926c2dc708..44841971f077 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -147,12 +147,11 @@ ENTRY(fsys_set_tid_address)
 	FSYS_RETURN
 END(fsys_set_tid_address)
 
-/*
- * Ensure that the time interpolator structure is compatible with the asm code
- */
-#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \
-	|| IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4
-#error fsys_gettimeofday incompatible with changes to struct time_interpolator
+#if IA64_GTOD_LOCK_OFFSET !=0
+#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t
+#endif
+#if IA64_ITC_JITTER_OFFSET !=0
+#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t
 #endif
 #define CLOCK_REALTIME 0
 #define CLOCK_MONOTONIC 1
@@ -179,126 +178,124 @@ ENTRY(fsys_gettimeofday)
 	// r11 = preserved: saved ar.pfs
 	// r12 = preserved: memory stack
 	// r13 = preserved: thread pointer
-	// r14 = address of mask / mask
+	// r14 = address of mask / mask value
 	// r15 = preserved: system call number
 	// r16 = preserved: current task pointer
-	// r17 = wall to monotonic use
-	// r18 = time_interpolator->offset
-	// r19 = address of wall_to_monotonic
-	// r20 = pointer to struct time_interpolator / pointer to time_interpolator->address
-	// r21 = shift factor
-	// r22 = address of time interpolator->last_counter
-	// r23 = address of time_interpolator->last_cycle
-	// r24 = adress of time_interpolator->offset
-	// r25 = last_cycle value
-	// r26 = last_counter value
-	// r27 = pointer to xtime
+	// r17 = (not used)
+	// r18 = (not used)
+	// r19 = address of itc_lastcycle
+	// r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence)
+	// r21 = address of mmio_ptr
+	// r22 = address of wall_time or monotonic_time
+	// r23 = address of shift / value
+	// r24 = address mult factor / cycle_last value
+	// r25 = itc_lastcycle value
+	// r26 = address clocksource cycle_last
+	// r27 = (not used)
 	// r28 = sequence number at the beginning of critcal section
-	// r29 = address of seqlock
+	// r29 = address of itc_jitter
 	// r30 = time processing flags / memory address
 	// r31 = pointer to result
 	// Predicates
 	// p6,p7 short term use
 	// p8 = timesource ar.itc
 	// p9 = timesource mmio64
-	// p10 = timesource mmio32
+	// p10 = timesource mmio32 - not used
 	// p11 = timesource not to be handled by asm code
-	// p12 = memory time source ( = p9 | p10)
-	// p13 = do cmpxchg with time_interpolator_last_cycle
+	// p12 = memory time source ( = p9 | p10) - not used
+	// p13 = do cmpxchg with itc_lastcycle
 	// p14 = Divide by 1000
 	// p15 = Add monotonic
 	//
-	// Note that instructions are optimized for McKinley. McKinley can process two
-	// bundles simultaneously and therefore we continuously try to feed the CPU
-	// two bundles and then a stop.
-	tnat.nz p6,p0 = r31	// branch deferred since it does not fit into bundle structure
+	// Note that instructions are optimized for McKinley. McKinley can
+	// process two bundles simultaneously and therefore we continuously
+	// try to feed the CPU two bundles and then a stop.
+	//
+	// Additional note that code has changed a lot. Optimization is TBD.
+	// Comments begin with "?" are maybe outdated.
+	tnat.nz p6,p0 = r31	// ? branch deferred to fit later bundle
 	mov pr = r30,0xc000	// Set predicates according to function
 	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
-	movl r20 = time_interpolator
+	movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
 	;;
-	ld8 r20 = [r20]		// get pointer to time_interpolator structure
-	movl r29 = xtime_lock
+	movl r29 = itc_jitter_data	// itc_jitter
+	add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20	// wall_time
 	ld4 r2 = [r2]		// process work pending flags
-	movl r27 = xtime
-	;;	// only one bundle here
-	ld8 r21 = [r20]		// first quad with control information
+	;;
+(p15)	add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20	// monotonic_time
+	add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
+	add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
 	and r2 = TIF_ALLWORK_MASK,r2
-(p6)    br.cond.spnt.few .fail_einval	// deferred branch
+(p6)    br.cond.spnt.few .fail_einval	// ? deferred branch
 	;;
-	add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20
-	extr r3 = r21,32,32	// time_interpolator->nsec_per_cyc
-	extr r8 = r21,0,16	// time_interpolator->source
+	add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last
 	cmp.ne p6, p0 = 0, r2	// Fallback if work is scheduled
 (p6)    br.cond.spnt.many fsys_fallback_syscall
 	;;
-	cmp.eq p8,p12 = 0,r8	// Check for cpu timer
-	cmp.eq p9,p0 = 1,r8	// MMIO64 ?
-	extr r2 = r21,24,8	// time_interpolator->jitter
-	cmp.eq p10,p0 = 2,r8	// MMIO32 ?
-	cmp.ltu p11,p0 = 2,r8	// function or other clock
-(p11)	br.cond.spnt.many fsys_fallback_syscall
+	// Begin critical section
+.time_redo:
+	ld4.acq r28 = [r20]	// gtod_lock.sequence, Must take first
+	;;
+	and r28 = ~1,r28	// And make sequence even to force retry if odd
 	;;
-	setf.sig f7 = r3	// Setup for scaling of counter
-(p15)	movl r19 = wall_to_monotonic
-(p12)	ld8 r30 = [r10]
-	cmp.ne p13,p0 = r2,r0	// need jitter compensation?
-	extr r21 = r21,16,8	// shift factor
+	ld8 r30 = [r21]		// clocksource->mmio_ptr
+	add r24 = IA64_CLKSRC_MULT_OFFSET,r20
+	ld4 r2 = [r29]		// itc_jitter value
+	add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20
+	add r14 = IA64_CLKSRC_MASK_OFFSET,r20
 	;;
-.time_redo:
-	.pred.rel.mutex p8,p9,p10
-	ld4.acq r28 = [r29]	// xtime_lock.sequence. Must come first for locking purposes
+	ld4 r3 = [r24]		// clocksource mult value
+	ld8 r14 = [r14]         // clocksource mask value
+	cmp.eq p8,p9 = 0,r30	// use cpu timer if no mmio_ptr
 	;;
-	and r28 = ~1,r28	// Make sequence even to force retry if odd
+	setf.sig f7 = r3	// Setup for mult scaling of counter
+(p8)	cmp.ne p13,p0 = r2,r0	// need itc_jitter compensation, set p13
+	ld4 r23 = [r23]		// clocksource shift value
+	ld8 r24 = [r26]		// get clksrc_cycle_last value
+(p9)	cmp.eq p13,p0 = 0,r30	// if mmio_ptr, clear p13 jitter control
 	;;
+	.pred.rel.mutex p8,p9
 (p8)	mov r2 = ar.itc		// CPU_TIMER. 36 clocks latency!!!
-	add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20
-(p9)	ld8 r2 = [r30]		// readq(ti->address). Could also have latency issues..
-(p10)	ld4 r2 = [r30]		// readw(ti->address)
-(p13)	add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20
-	;;			// could be removed by moving the last add upward
-	ld8 r26 = [r22]		// time_interpolator->last_counter
-(p13)	ld8 r25 = [r23]		// time interpolator->last_cycle
-	add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20
-(p15)	ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET
- 	ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET
-	add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20
-	;;
-	ld8 r18 = [r24]		// time_interpolator->offset
-	ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET	// xtime.tv_nsec
-(p13)	sub r3 = r25,r2	// Diff needed before comparison (thanks davidm)
-	;;
-	ld8 r14 = [r14]		// time_interpolator->mask
-(p13)	cmp.gt.unc p6,p7 = r3,r0	// check if it is less than last. p6,p7 cleared
-	sub r10 = r2,r26	// current_counter - last_counter
-	;;
-(p6)	sub r10 = r25,r26	// time we got was less than last_cycle
+(p9)	ld8 r2 = [r30]		// MMIO_TIMER. Could also have latency issues..
+(p13)	ld8 r25 = [r19]		// get itc_lastcycle value
+	;;		// ? could be removed by moving the last add upward
+	ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET	// tv_sec
+	;;
+	ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET	// tv_nsec
+(p13)	sub r3 = r25,r2		// Diff needed before comparison (thanks davidm)
+	;;
+(p13)	cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
+	sub r10 = r2,r24	// current_cycle - last_cycle
+	;;
+(p6)	sub r10 = r25,r24	// time we got was less than last_cycle
 (p7)	mov ar.ccv = r25	// more than last_cycle. Prep for cmpxchg
 	;;
+(p7)	cmpxchg8.rel r3 = [r19],r2,ar.ccv
+	;;
+(p7)	cmp.ne p7,p0 = r25,r3	// if cmpxchg not successful
+	;;
+(p7)	sub r10 = r3,r24	// then use new last_cycle instead
+	;;
 	and r10 = r10,r14	// Apply mask
 	;;
 	setf.sig f8 = r10
 	nop.i 123
 	;;
-(p7)	cmpxchg8.rel r3 = [r23],r2,ar.ccv
-EX(.fail_efault, probe.w.fault r31, 3)	// This takes 5 cycles and we have spare time
+	// fault check takes 5 cycles and we have spare time
+EX(.fail_efault, probe.w.fault r31, 3)
 	xmpy.l f8 = f8,f7	// nsec_per_cyc*(counter-last_counter)
-(p15)	add r9 = r9,r17		// Add wall to monotonic.secs to result secs
 	;;
-(p15)	ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET
-(p7)	cmp.ne p7,p0 = r25,r3	// if cmpxchg not successful redo
-	// simulate tbit.nz.or p7,p0 = r28,0
+	// ? simulate tbit.nz.or p7,p0 = r28,0
 	getf.sig r2 = f8
 	mf
-	add r8 = r8,r18		// Add time interpolator offset
 	;;
-	ld4 r10 = [r29]		// xtime_lock.sequence
-(p15)	add r8 = r8, r17	// Add monotonic.nsecs to nsecs
-	shr.u r2 = r2,r21
-	;;		// overloaded 3 bundles!
-	// End critical section.
+	ld4 r10 = [r20]		// gtod_lock.sequence
+	shr.u r2 = r2,r23	// shift by factor
+	;;		// ? overloaded 3 bundles!
 	add r8 = r8,r2		// Add xtime.nsecs
-	cmp4.ne.or p7,p0 = r28,r10
-(p7)	br.cond.dpnt.few .time_redo	// sequence number changed ?
+	cmp4.ne p7,p0 = r28,r10
+(p7)	br.cond.dpnt.few .time_redo	// sequence number changed, redo
+	// End critical section.
 	// Now r8=tv->tv_nsec and r9=tv->tv_sec
 	mov r10 = r0
 	movl r2 = 1000000000
@@ -308,19 +305,19 @@ EX(.fail_efault, probe.w.fault r31, 3)	// This takes 5 cycles and we have spare
 .time_normalize:
 	mov r21 = r8
 	cmp.ge p6,p0 = r8,r2
-(p14)	shr.u r20 = r8, 3		// We can repeat this if necessary just wasting some time
+(p14)	shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time
 	;;
 (p14)	setf.sig f8 = r20
 (p6)	sub r8 = r8,r2
-(p6)	add r9 = 1,r9			// two nops before the branch.
-(p14)	setf.sig f7 = r3		// Chances for repeats are 1 in 10000 for gettod
+(p6)	add r9 = 1,r9		// two nops before the branch.
+(p14)	setf.sig f7 = r3	// Chances for repeats are 1 in 10000 for gettod
 (p6)	br.cond.dpnt.few .time_normalize
 	;;
 	// Divided by 8 though shift. Now divide by 125
 	// The compiler was able to do that with a multiply
 	// and a shift and we do the same
-EX(.fail_efault, probe.w.fault r23, 3)		// This also costs 5 cycles
-(p14)	xmpy.hu f8 = f8, f7			// xmpy has 5 cycles latency so use it...
+EX(.fail_efault, probe.w.fault r23, 3)	// This also costs 5 cycles
+(p14)	xmpy.hu f8 = f8, f7		// xmpy has 5 cycles latency so use it
 	;;
 	mov r8 = r0
 (p14)	getf.sig r2 = f8
diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h
new file mode 100644
index 000000000000..490dab55fba3
--- /dev/null
+++ b/arch/ia64/kernel/fsyscall_gtod_data.h
@@ -0,0 +1,23 @@
+/*
+ * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ *        Contributed by Peter Keilty <peter.keilty@hp.com>
+ *
+ * fsyscall gettimeofday data
+ */
+
+struct fsyscall_gtod_data_t {
+	seqlock_t	lock;
+	struct timespec	wall_time;
+	struct timespec monotonic_time;
+	cycle_t		clk_mask;
+	u32		clk_mult;
+	u32		clk_shift;
+	void		*clk_fsys_mmio;
+	cycle_t		clk_cycle_last;
+} __attribute__ ((aligned (L1_CACHE_BYTES)));
+
+struct itc_jitter_data_t {
+	int		itc_jitter;
+	cycle_t		itc_lastcycle;
+} __attribute__ ((aligned (L1_CACHE_BYTES)));
+
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 3486fe7d6e65..627785c48ea9 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -19,6 +19,7 @@
 #include <linux/interrupt.h>
 #include <linux/efi.h>
 #include <linux/timex.h>
+#include <linux/clocksource.h>
 
 #include <asm/machvec.h>
 #include <asm/delay.h>
@@ -28,6 +29,16 @@
 #include <asm/sections.h>
 #include <asm/system.h>
 
+#include "fsyscall_gtod_data.h"
+
+static cycle_t itc_get_cycles(void);
+
+struct fsyscall_gtod_data_t fsyscall_gtod_data = {
+	.lock = SEQLOCK_UNLOCKED,
+};
+
+struct itc_jitter_data_t itc_jitter_data;
+
 volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
 
 #ifdef CONFIG_IA64_DEBUG_IRQ
@@ -37,11 +48,16 @@ EXPORT_SYMBOL(last_cli_ip);
 
 #endif
 
-static struct time_interpolator itc_interpolator = {
-	.shift = 16,
-	.mask = 0xffffffffffffffffLL,
-	.source = TIME_SOURCE_CPU
+static struct clocksource clocksource_itc = {
+        .name           = "itc",
+        .rating         = 350,
+        .read           = itc_get_cycles,
+        .mask           = 0xffffffffffffffff,
+        .mult           = 0, /*to be caluclated*/
+        .shift          = 16,
+        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
+static struct clocksource *itc_clocksource;
 
 static irqreturn_t
 timer_interrupt (int irq, void *dev_id)
@@ -210,8 +226,6 @@ ia64_init_itm (void)
 					+ itc_freq/2)/itc_freq;
 
 	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
-		itc_interpolator.frequency = local_cpu_data->itc_freq;
-		itc_interpolator.drift = itc_drift;
 #ifdef CONFIG_SMP
 		/* On IA64 in an SMP configuration ITCs are never accurately synchronized.
 		 * Jitter compensation requires a cmpxchg which may limit
@@ -223,15 +237,50 @@ ia64_init_itm (void)
 		 * even going backward) if the ITC offsets between the individual CPUs
 		 * are too large.
 		 */
-		if (!nojitter) itc_interpolator.jitter = 1;
+		if (!nojitter)
+			itc_jitter_data.itc_jitter = 1;
 #endif
-		register_time_interpolator(&itc_interpolator);
 	}
 
 	/* Setup the CPU local timer tick */
 	ia64_cpu_local_tick();
+
+	if (!itc_clocksource) {
+		/* Sort out mult/shift values: */
+		clocksource_itc.mult =
+			clocksource_hz2mult(local_cpu_data->itc_freq,
+						clocksource_itc.shift);
+		clocksource_register(&clocksource_itc);
+		itc_clocksource = &clocksource_itc;
+	}
 }
 
+static cycle_t itc_get_cycles()
+{
+	u64 lcycle, now, ret;
+
+	if (!itc_jitter_data.itc_jitter)
+		return get_cycles();
+
+	lcycle = itc_jitter_data.itc_lastcycle;
+	now = get_cycles();
+	if (lcycle && time_after(lcycle, now))
+		return lcycle;
+
+	/*
+	 * Keep track of the last timer value returned.
+	 * In an SMP environment, you could lose out in contention of
+	 * cmpxchg. If so, your cmpxchg returns new value which the
+	 * winner of contention updated to. Use the new value instead.
+	 */
+	ret = cmpxchg(&itc_jitter_data.itc_lastcycle, lcycle, now);
+	if (unlikely(ret != lcycle))
+		return ret;
+
+	return now;
+}
+
+
 static struct irqaction timer_irqaction = {
 	.handler =	timer_interrupt,
 	.flags =	IRQF_DISABLED | IRQF_IRQPOLL,
@@ -307,3 +356,34 @@ ia64_setup_printk_clock(void)
 	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT))
 		ia64_printk_clock = ia64_itc_printk_clock;
 }
+
+void update_vsyscall(struct timespec *wall, struct clocksource *c)
+{
+        unsigned long flags;
+
+        write_seqlock_irqsave(&fsyscall_gtod_data.lock, flags);
+
+        /* copy fsyscall clock data */
+        fsyscall_gtod_data.clk_mask = c->mask;
+        fsyscall_gtod_data.clk_mult = c->mult;
+        fsyscall_gtod_data.clk_shift = c->shift;
+        fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
+        fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
+
+	/* copy kernel time structures */
+        fsyscall_gtod_data.wall_time.tv_sec = wall->tv_sec;
+        fsyscall_gtod_data.wall_time.tv_nsec = wall->tv_nsec;
+        fsyscall_gtod_data.monotonic_time.tv_sec = wall_to_monotonic.tv_sec
+							+ wall->tv_sec;
+        fsyscall_gtod_data.monotonic_time.tv_nsec = wall_to_monotonic.tv_nsec
+							+ wall->tv_nsec;
+
+	/* normalize */
+	while (fsyscall_gtod_data.monotonic_time.tv_nsec >= NSEC_PER_SEC) {
+		fsyscall_gtod_data.monotonic_time.tv_nsec -= NSEC_PER_SEC;
+		fsyscall_gtod_data.monotonic_time.tv_sec++;
+	}
+
+        write_sequnlock_irqrestore(&fsyscall_gtod_data.lock, flags);
+}
+
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
index 56a88b6df4b4..19e25d2b64fc 100644
--- a/arch/ia64/sn/kernel/sn2/timer.c
+++ b/arch/ia64/sn/kernel/sn2/timer.c
@@ -11,6 +11,7 @@
 #include <linux/sched.h>
 #include <linux/time.h>
 #include <linux/interrupt.h>
+#include <linux/clocksource.h>
 
 #include <asm/hw_irq.h>
 #include <asm/system.h>
@@ -22,11 +23,21 @@
 
 extern unsigned long sn_rtc_cycles_per_second;
 
-static struct time_interpolator sn2_interpolator = {
-	.drift = -1,
-	.shift = 10,
-	.mask = (1LL << 55) - 1,
-	.source = TIME_SOURCE_MMIO64
+static void __iomem *sn2_mc;
+
+static cycle_t read_sn2(void)
+{
+	return (cycle_t)readq(sn2_mc);
+}
+
+static struct clocksource clocksource_sn2 = {
+        .name           = "sn2_rtc",
+        .rating         = 300,
+        .read           = read_sn2,
+        .mask           = (1LL << 55) - 1,
+        .mult           = 0,
+        .shift          = 10,
+        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
 /*
@@ -47,9 +58,11 @@ ia64_sn_udelay (unsigned long usecs)
 
 void __init sn_timer_init(void)
 {
-	sn2_interpolator.frequency = sn_rtc_cycles_per_second;
-	sn2_interpolator.addr = RTC_COUNTER_ADDR;
-	register_time_interpolator(&sn2_interpolator);
+	sn2_mc = RTC_COUNTER_ADDR;
+	clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR;
+	clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+							clocksource_sn2.shift);
+	clocksource_register(&clocksource_sn2);
 
 	ia64_udelay = &ia64_sn_udelay;
 }
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 80ffc7829916..bb5d23be4260 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -475,7 +475,7 @@ static void acpi_processor_idle(void)
 		/* Get end time (ticks) */
 		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
 
-#ifdef CONFIG_GENERIC_TIME
+#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
 		/* TSC halts in C2, so notify users */
 		mark_tsc_unstable("possible TSC halt in C2");
 #endif
@@ -517,7 +517,7 @@ static void acpi_processor_idle(void)
 			acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
 		}
 
-#ifdef CONFIG_GENERIC_TIME
+#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
 		/* TSC halts in C3, so notify users */
 		mark_tsc_unstable("TSC halts in C3");
 #endif
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 0be700f4e8fd..ba0e74ad74bb 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -29,6 +29,7 @@
 #include <linux/bcd.h>
 #include <linux/seq_file.h>
 #include <linux/bitops.h>
+#include <linux/clocksource.h>
 
 #include <asm/current.h>
 #include <asm/uaccess.h>
@@ -51,8 +52,34 @@
 
 #define HPET_RANGE_SIZE		1024	/* from HPET spec */
 
+#if BITS_PER_LONG == 64
+#define	write_counter(V, MC)	writeq(V, MC)
+#define	read_counter(MC)	readq(MC)
+#else
+#define	write_counter(V, MC)	writel(V, MC)
+#define	read_counter(MC)	readl(MC)
+#endif
+
 static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ;
 
+static void __iomem *hpet_mctr;
+
+static cycle_t read_hpet(void)
+{
+	return (cycle_t)read_counter((void __iomem *)hpet_mctr);
+}
+
+static struct clocksource clocksource_hpet = {
+        .name           = "hpet",
+        .rating         = 250,
+        .read           = read_hpet,
+        .mask           = 0xffffffffffffffff,
+        .mult           = 0, /*to be caluclated*/
+        .shift          = 10,
+        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+static struct clocksource *hpet_clocksource;
+
 /* A lock for concurrent access by app and isr hpet activity. */
 static DEFINE_SPINLOCK(hpet_lock);
 /* A lock for concurrent intermodule access to hpet and isr hpet activity. */
@@ -79,7 +106,7 @@ struct hpets {
 	struct hpets *hp_next;
 	struct hpet __iomem *hp_hpet;
 	unsigned long hp_hpet_phys;
-	struct time_interpolator *hp_interpolator;
+	struct clocksource *hp_clocksource;
 	unsigned long long hp_tick_freq;
 	unsigned long hp_delta;
 	unsigned int hp_ntimer;
@@ -94,13 +121,6 @@ static struct hpets *hpets;
 #define	HPET_PERIODIC		0x0004
 #define	HPET_SHARED_IRQ		0x0008
 
-#if BITS_PER_LONG == 64
-#define	write_counter(V, MC)	writeq(V, MC)
-#define	read_counter(MC)	readq(MC)
-#else
-#define	write_counter(V, MC) 	writel(V, MC)
-#define	read_counter(MC)	readl(MC)
-#endif
 
 #ifndef readq
 static inline unsigned long long readq(void __iomem *addr)
@@ -737,27 +757,6 @@ static ctl_table dev_root[] = {
 
 static struct ctl_table_header *sysctl_header;
 
-static void hpet_register_interpolator(struct hpets *hpetp)
-{
-#ifdef	CONFIG_TIME_INTERPOLATION
-	struct time_interpolator *ti;
-
-	ti = kzalloc(sizeof(*ti), GFP_KERNEL);
-	if (!ti)
-		return;
-
-	ti->source = TIME_SOURCE_MMIO64;
-	ti->shift = 10;
-	ti->addr = &hpetp->hp_hpet->hpet_mc;
-	ti->frequency = hpetp->hp_tick_freq;
-	ti->drift = HPET_DRIFT;
-	ti->mask = -1;
-
-	hpetp->hp_interpolator = ti;
-	register_time_interpolator(ti);
-#endif
-}
-
 /*
  * Adjustment for when arming the timer with
  * initial conditions.  That is, main counter
@@ -909,7 +908,16 @@ int hpet_alloc(struct hpet_data *hdp)
 	}
 
 	hpetp->hp_delta = hpet_calibrate(hpetp);
-	hpet_register_interpolator(hpetp);
+
+	if (!hpet_clocksource) {
+		hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc;
+		CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr);
+		clocksource_hpet.mult = clocksource_hz2mult(hpetp->hp_tick_freq,
+						clocksource_hpet.shift);
+		clocksource_register(&clocksource_hpet);
+		hpetp->hp_clocksource = &clocksource_hpet;
+		hpet_clocksource = &clocksource_hpet;
+	}
 
 	return 0;
 }
@@ -995,7 +1003,7 @@ static int hpet_acpi_add(struct acpi_device *device)
 
 static int hpet_acpi_remove(struct acpi_device *device, int type)
 {
-	/* XXX need to unregister interpolator, dealloc mem, etc */
+	/* XXX need to unregister clocksource, dealloc mem, etc */
 	return -EINVAL;
 }
 
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index bf297b03a4e4..16ea3374dddf 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -67,6 +67,12 @@ struct clocksource {
 	unsigned long flags;
 	cycle_t (*vread)(void);
 	void (*resume)(void);
+#ifdef CONFIG_IA64
+	void *fsys_mmio;        /* used by fsyscall asm code */
+#define CLKSRC_FSYS_MMIO_SET(mmio, addr)      ((mmio) = (addr))
+#else
+#define CLKSRC_FSYS_MMIO_SET(mmio, addr)      do { } while (0)
+#endif
 
 	/* timekeeping specific data, ignore */
 	cycle_t cycle_interval;
-- 
cgit v1.3-8-gc7d7


From 1f564ad6d4182859612cbae452122e5eb2d62a76 Mon Sep 17 00:00:00 2001
From: Bob Picco <bob.picco@hp.com>
Date: Wed, 18 Jul 2007 15:51:28 -0700
Subject: [IA64] remove time interpolator

Remove time_interpolator code (This is generic code, but
only user was ia64.  It has been superseded by the
CONFIG_GENERIC_TIME code).

Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Peter Keilty <peter.keilty@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 Documentation/time_interpolators.txt |  41 --------
 include/linux/timex.h                |  60 -----------
 kernel/time.c                        |  88 ----------------
 kernel/time/ntp.c                    |  10 --
 kernel/time/timekeeping.c            |   4 -
 kernel/timer.c                       | 188 -----------------------------------
 6 files changed, 391 deletions(-)
 delete mode 100644 Documentation/time_interpolators.txt

(limited to 'include/linux')

diff --git a/Documentation/time_interpolators.txt b/Documentation/time_interpolators.txt
deleted file mode 100644
index e3b60854fbc2..000000000000
--- a/Documentation/time_interpolators.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-Time Interpolators
-------------------
-
-Time interpolators are a base of time calculation between timer ticks and
-allow an accurate determination of time down to the accuracy of the time
-source in nanoseconds.
-
-The architecture specific code typically provides gettimeofday and
-settimeofday under Linux. The time interpolator provides both if an arch
-defines CONFIG_TIME_INTERPOLATION. The arch still must set up timer tick
-operations and call the necessary functions to advance the clock.
-
-With the time interpolator a standardized interface exists for time
-interpolation between ticks. The provided logic is highly scalable
-and has been tested in SMP situations of up to 512 CPUs.
-
-If CONFIG_TIME_INTERPOLATION is defined then the architecture specific code
-(or the device drivers - like HPET) may register time interpolators.
-These are typically defined in the following way:
-
-static struct time_interpolator my_interpolator {
-	.frequency = MY_FREQUENCY,
-	.source = TIME_SOURCE_MMIO32,
-	.shift = 8,		/* scaling for higher accuracy */
-	.drift = -1,		/* Unknown drift */
-	.jitter = 0		/* time source is stable */
-};
-
-void time_init(void)
-{
-	....
-	/* Initialization of the timer *.
-	my_interpolator.address = &my_timer;
-	register_time_interpolator(&my_interpolator);
-	....
-}
-
-For more details see include/linux/timex.h and kernel/timer.c.
-
-Christoph Lameter <christoph@lameter.com>, October 31, 2004
-
diff --git a/include/linux/timex.h b/include/linux/timex.h
index da929dbbea2a..37ac3ff90faf 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -224,66 +224,6 @@ static inline int ntp_synced(void)
 	__x < 0 ? -(-__x >> __s) : __x >> __s;	\
 })
 
-
-#ifdef CONFIG_TIME_INTERPOLATION
-
-#define TIME_SOURCE_CPU 0
-#define TIME_SOURCE_MMIO64 1
-#define TIME_SOURCE_MMIO32 2
-#define TIME_SOURCE_FUNCTION 3
-
-/* For proper operations time_interpolator clocks must run slightly slower
- * than the standard clock since the interpolator may only correct by having
- * time jump forward during a tick. A slower clock is usually a side effect
- * of the integer divide of the nanoseconds in a second by the frequency.
- * The accuracy of the division can be increased by specifying a shift.
- * However, this may cause the clock not to be slow enough.
- * The interpolator will self-tune the clock by slowing down if no
- * resets occur or speeding up if the time jumps per analysis cycle
- * become too high.
- *
- * Setting jitter compensates for a fluctuating timesource by comparing
- * to the last value read from the timesource to insure that an earlier value
- * is not returned by a later call. The price to pay
- * for the compensation is that the timer routines are not as scalable anymore.
- */
-
-struct time_interpolator {
-	u16 source;			/* time source flags */
-	u8 shift;			/* increases accuracy of multiply by shifting. */
-				/* Note that bits may be lost if shift is set too high */
-	u8 jitter;			/* if set compensate for fluctuations */
-	u32 nsec_per_cyc;		/* set by register_time_interpolator() */
-	void *addr;			/* address of counter or function */
-	cycles_t mask;			/* mask the valid bits of the counter */
-	unsigned long offset;		/* nsec offset at last update of interpolator */
-	u64 last_counter;		/* counter value in units of the counter at last update */
-	cycles_t last_cycle;		/* Last timer value if TIME_SOURCE_JITTER is set */
-	u64 frequency;			/* frequency in counts/second */
-	long drift;			/* drift in parts-per-million (or -1) */
-	unsigned long skips;		/* skips forward */
-	unsigned long ns_skipped;	/* nanoseconds skipped */
-	struct time_interpolator *next;
-};
-
-extern void register_time_interpolator(struct time_interpolator *);
-extern void unregister_time_interpolator(struct time_interpolator *);
-extern void time_interpolator_reset(void);
-extern unsigned long time_interpolator_get_offset(void);
-extern void time_interpolator_update(long delta_nsec);
-
-#else /* !CONFIG_TIME_INTERPOLATION */
-
-static inline void time_interpolator_reset(void)
-{
-}
-
-static inline void time_interpolator_update(long delta_nsec)
-{
-}
-
-#endif /* !CONFIG_TIME_INTERPOLATION */
-
 #define TICK_LENGTH_SHIFT	32
 
 #ifdef CONFIG_NO_HZ
diff --git a/kernel/time.c b/kernel/time.c
index ffe19149d770..e325597f5bf5 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
 	write_seqlock_irq(&xtime_lock);
 	wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
 	xtime.tv_sec += sys_tz.tz_minuteswest * 60;
-	time_interpolator_reset();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 }
@@ -309,92 +308,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
 }
 EXPORT_SYMBOL(timespec_trunc);
 
-#ifdef CONFIG_TIME_INTERPOLATION
-void getnstimeofday (struct timespec *tv)
-{
-	unsigned long seq,sec,nsec;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		sec = xtime.tv_sec;
-		nsec = xtime.tv_nsec+time_interpolator_get_offset();
-	} while (unlikely(read_seqretry(&xtime_lock, seq)));
-
-	while (unlikely(nsec >= NSEC_PER_SEC)) {
-		nsec -= NSEC_PER_SEC;
-		++sec;
-	}
-	tv->tv_sec = sec;
-	tv->tv_nsec = nsec;
-}
-EXPORT_SYMBOL_GPL(getnstimeofday);
-
-int do_settimeofday (struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	{
-		wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-		wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-		set_normalized_timespec(&xtime, sec, nsec);
-		set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-		time_adjust = 0;		/* stop active adjtime() */
-		time_status |= STA_UNSYNC;
-		time_maxerror = NTP_PHASE_LIMIT;
-		time_esterror = NTP_PHASE_LIMIT;
-		time_interpolator_reset();
-	}
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-	return 0;
-}
-EXPORT_SYMBOL(do_settimeofday);
-
-void do_gettimeofday (struct timeval *tv)
-{
-	unsigned long seq, nsec, usec, sec, offset;
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		offset = time_interpolator_get_offset();
-		sec = xtime.tv_sec;
-		nsec = xtime.tv_nsec;
-	} while (unlikely(read_seqretry(&xtime_lock, seq)));
-
-	usec = (nsec + offset) / 1000;
-
-	while (unlikely(usec >= USEC_PER_SEC)) {
-		usec -= USEC_PER_SEC;
-		++sec;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-
-	/*
-	 * Make sure xtime.tv_sec [returned by sys_time()] always
-	 * follows the gettimeofday() result precisely. This
-	 * condition is extremely unlikely, it can hit at most
-	 * once per second:
-	 */
-	if (unlikely(xtime.tv_sec != tv->tv_sec)) {
-		unsigned long flags;
-
-		write_seqlock_irqsave(&xtime_lock, flags);
-		update_wall_time();
-		write_sequnlock_irqrestore(&xtime_lock, flags);
-	}
-}
-EXPORT_SYMBOL(do_gettimeofday);
-
-#else	/* CONFIG_TIME_INTERPOLATION */
-
 #ifndef CONFIG_GENERIC_TIME
 /*
  * Simulate gettimeofday using do_gettimeofday which only allows a timeval
@@ -410,7 +323,6 @@ void getnstimeofday(struct timespec *tv)
 }
 EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif
-#endif	/* CONFIG_TIME_INTERPOLATION */
 
 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
  * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 438c6b723ee2..b5e352597cbb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -116,11 +116,6 @@ void second_overflow(void)
 		if (xtime.tv_sec % 86400 == 0) {
 			xtime.tv_sec--;
 			wall_to_monotonic.tv_sec++;
-			/*
-			 * The timer interpolator will make time change
-			 * gradually instead of an immediate jump by one second
-			 */
-			time_interpolator_update(-NSEC_PER_SEC);
 			time_state = TIME_OOP;
 			printk(KERN_NOTICE "Clock: inserting leap second "
 					"23:59:60 UTC\n");
@@ -130,11 +125,6 @@ void second_overflow(void)
 		if ((xtime.tv_sec + 1) % 86400 == 0) {
 			xtime.tv_sec++;
 			wall_to_monotonic.tv_sec--;
-			/*
-			 * Use of time interpolator for a gradual change of
-			 * time
-			 */
-			time_interpolator_update(NSEC_PER_SEC);
 			time_state = TIME_WAIT;
 			printk(KERN_NOTICE "Clock: deleting leap second "
 					"23:59:59 UTC\n");
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 728cedfd3cbd..027d46c906e0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -466,10 +466,6 @@ void update_wall_time(void)
 			second_overflow();
 		}
 
-		/* interpolator bits */
-		time_interpolator_update(clock->xtime_interval
-						>> clock->shift);
-
 		/* accumulate error between NTP and clock interval */
 		clock->error += current_tick_length();
 		clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
diff --git a/kernel/timer.c b/kernel/timer.c
index b7792fb03387..dbc03ab14eed 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1349,194 +1349,6 @@ void __init init_timers(void)
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
 }
 
-#ifdef CONFIG_TIME_INTERPOLATION
-
-struct time_interpolator *time_interpolator __read_mostly;
-static struct time_interpolator *time_interpolator_list __read_mostly;
-static DEFINE_SPINLOCK(time_interpolator_lock);
-
-static inline cycles_t time_interpolator_get_cycles(unsigned int src)
-{
-	unsigned long (*x)(void);
-
-	switch (src)
-	{
-		case TIME_SOURCE_FUNCTION:
-			x = time_interpolator->addr;
-			return x();
-
-		case TIME_SOURCE_MMIO64	:
-			return readq_relaxed((void __iomem *)time_interpolator->addr);
-
-		case TIME_SOURCE_MMIO32	:
-			return readl_relaxed((void __iomem *)time_interpolator->addr);
-
-		default: return get_cycles();
-	}
-}
-
-static inline u64 time_interpolator_get_counter(int writelock)
-{
-	unsigned int src = time_interpolator->source;
-
-	if (time_interpolator->jitter)
-	{
-		cycles_t lcycle;
-		cycles_t now;
-
-		do {
-			lcycle = time_interpolator->last_cycle;
-			now = time_interpolator_get_cycles(src);
-			if (lcycle && time_after(lcycle, now))
-				return lcycle;
-
-			/* When holding the xtime write lock, there's no need
-			 * to add the overhead of the cmpxchg.  Readers are
-			 * force to retry until the write lock is released.
-			 */
-			if (writelock) {
-				time_interpolator->last_cycle = now;
-				return now;
-			}
-			/* Keep track of the last timer value returned. The use of cmpxchg here
-			 * will cause contention in an SMP environment.
-			 */
-		} while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
-		return now;
-	}
-	else
-		return time_interpolator_get_cycles(src);
-}
-
-void time_interpolator_reset(void)
-{
-	time_interpolator->offset = 0;
-	time_interpolator->last_counter = time_interpolator_get_counter(1);
-}
-
-#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
-
-unsigned long time_interpolator_get_offset(void)
-{
-	/* If we do not have a time interpolator set up then just return zero */
-	if (!time_interpolator)
-		return 0;
-
-	return time_interpolator->offset +
-		GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
-}
-
-#define INTERPOLATOR_ADJUST 65536
-#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
-
-void time_interpolator_update(long delta_nsec)
-{
-	u64 counter;
-	unsigned long offset;
-
-	/* If there is no time interpolator set up then do nothing */
-	if (!time_interpolator)
-		return;
-
-	/*
-	 * The interpolator compensates for late ticks by accumulating the late
-	 * time in time_interpolator->offset. A tick earlier than expected will
-	 * lead to a reset of the offset and a corresponding jump of the clock
-	 * forward. Again this only works if the interpolator clock is running
-	 * slightly slower than the regular clock and the tuning logic insures
-	 * that.
-	 */
-
-	counter = time_interpolator_get_counter(1);
-	offset = time_interpolator->offset +
-			GET_TI_NSECS(counter, time_interpolator);
-
-	if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
-		time_interpolator->offset = offset - delta_nsec;
-	else {
-		time_interpolator->skips++;
-		time_interpolator->ns_skipped += delta_nsec - offset;
-		time_interpolator->offset = 0;
-	}
-	time_interpolator->last_counter = counter;
-
-	/* Tuning logic for time interpolator invoked every minute or so.
-	 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
-	 * Increase interpolator clock speed if we skip too much time.
-	 */
-	if (jiffies % INTERPOLATOR_ADJUST == 0)
-	{
-		if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
-			time_interpolator->nsec_per_cyc--;
-		if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
-			time_interpolator->nsec_per_cyc++;
-		time_interpolator->skips = 0;
-		time_interpolator->ns_skipped = 0;
-	}
-}
-
-static inline int
-is_better_time_interpolator(struct time_interpolator *new)
-{
-	if (!time_interpolator)
-		return 1;
-	return new->frequency > 2*time_interpolator->frequency ||
-	    (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
-}
-
-void
-register_time_interpolator(struct time_interpolator *ti)
-{
-	unsigned long flags;
-
-	/* Sanity check */
-	BUG_ON(ti->frequency == 0 || ti->mask == 0);
-
-	ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
-	spin_lock(&time_interpolator_lock);
-	write_seqlock_irqsave(&xtime_lock, flags);
-	if (is_better_time_interpolator(ti)) {
-		time_interpolator = ti;
-		time_interpolator_reset();
-	}
-	write_sequnlock_irqrestore(&xtime_lock, flags);
-
-	ti->next = time_interpolator_list;
-	time_interpolator_list = ti;
-	spin_unlock(&time_interpolator_lock);
-}
-
-void
-unregister_time_interpolator(struct time_interpolator *ti)
-{
-	struct time_interpolator *curr, **prev;
-	unsigned long flags;
-
-	spin_lock(&time_interpolator_lock);
-	prev = &time_interpolator_list;
-	for (curr = *prev; curr; curr = curr->next) {
-		if (curr == ti) {
-			*prev = curr->next;
-			break;
-		}
-		prev = &curr->next;
-	}
-
-	write_seqlock_irqsave(&xtime_lock, flags);
-	if (ti == time_interpolator) {
-		/* we lost the best time-interpolator: */
-		time_interpolator = NULL;
-		/* find the next-best interpolator */
-		for (curr = time_interpolator_list; curr; curr = curr->next)
-			if (is_better_time_interpolator(curr))
-				time_interpolator = curr;
-		time_interpolator_reset();
-	}
-	write_sequnlock_irqrestore(&xtime_lock, flags);
-	spin_unlock(&time_interpolator_lock);
-}
-#endif /* CONFIG_TIME_INTERPOLATION */
-
 /**
  * msleep - sleep safely even with waitqueue interruptions
  * @msecs: Time in milliseconds to sleep for
-- 
cgit v1.3-8-gc7d7


From 1d4ec7b1d6f130818f9b62dea3411d9ee2ff6ff6 Mon Sep 17 00:00:00 2001
From: Roland Dreier <rdreier@cisco.com>
Date: Fri, 20 Jul 2007 12:13:20 -0700
Subject: Fix ZERO_OR_NULL_PTR(ZERO_SIZE_PTR)

The comparison with ZERO_SIZE_PTR in ZERO_OR_NULL_PTR() needs to be <=
(not just <) so that ZERO_OR_NULL_PTR(ZERO_SIZE_PTR) is 1.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
[ Duh!  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7d0ecc1659f0..d859354b9e51 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -40,7 +40,7 @@
  */
 #define ZERO_SIZE_PTR ((void *)16)
 
-#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) < \
+#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
 				(unsigned long)ZERO_SIZE_PTR)
 
 /*
-- 
cgit v1.3-8-gc7d7


From 8e68e2f248332a9c3fd4f08258f488c209bd3e0c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jul 2007 21:39:47 +0200
Subject: [CELL] spufs: extension of spu_create to support affinity definition

This patch adds support for additional flags at spu_create, which relate
to the establishment of affinity between contexts and contexts to memory.
A fourth, optional, parameter is supported. This parameter represent
a affinity neighbor of the context being created, and is used when defining
SPU-SPU affinity.
Affinity is represented as a doubly linked list of spu_contexts.

Signed-off-by: Andre Detsch <adetsch@br.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
---
 arch/powerpc/platforms/cell/spu_syscalls.c   |  17 +++-
 arch/powerpc/platforms/cell/spufs/context.c  |   1 +
 arch/powerpc/platforms/cell/spufs/gang.c     |   4 +
 arch/powerpc/platforms/cell/spufs/inode.c    | 132 +++++++++++++++++++++++++--
 arch/powerpc/platforms/cell/spufs/spufs.h    |  16 +++-
 arch/powerpc/platforms/cell/spufs/syscalls.c |  32 ++++++-
 include/asm-powerpc/spu.h                    |   8 +-
 include/linux/syscalls.h                     |   2 +-
 8 files changed, 195 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index 261b507a901a..dd2c6688c8aa 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -34,14 +34,27 @@ struct spufs_calls spufs_calls = {
  * this file is not used and the syscalls directly enter the fs code */
 
 asmlinkage long sys_spu_create(const char __user *name,
-		unsigned int flags, mode_t mode)
+		unsigned int flags, mode_t mode, int neighbor_fd)
 {
 	long ret;
 	struct module *owner = spufs_calls.owner;
+	struct file *neighbor;
+	int fput_needed;
 
 	ret = -ENOSYS;
 	if (owner && try_module_get(owner)) {
-		ret = spufs_calls.create_thread(name, flags, mode);
+		if (flags & SPU_CREATE_AFFINITY_SPU) {
+			neighbor = fget_light(neighbor_fd, &fput_needed);
+			if (neighbor) {
+				ret = spufs_calls.create_thread(name, flags,
+								mode, neighbor);
+				fput_light(neighbor, fput_needed);
+			}
+		}
+		else {
+			ret = spufs_calls.create_thread(name, flags,
+							mode, NULL);
+		}
 		module_put(owner);
 	}
 	return ret;
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index 6b091ea1d192..a7efb999d65e 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -55,6 +55,7 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	ctx->ops = &spu_backing_ops;
 	ctx->owner = get_task_mm(current);
 	INIT_LIST_HEAD(&ctx->rq);
+	INIT_LIST_HEAD(&ctx->aff_list);
 	if (gang)
 		spu_gang_add_ctx(gang, ctx);
 	ctx->cpus_allowed = current->cpus_allowed;
diff --git a/arch/powerpc/platforms/cell/spufs/gang.c b/arch/powerpc/platforms/cell/spufs/gang.c
index 212ea78f9051..0a752ce67c8a 100644
--- a/arch/powerpc/platforms/cell/spufs/gang.c
+++ b/arch/powerpc/platforms/cell/spufs/gang.c
@@ -35,7 +35,9 @@ struct spu_gang *alloc_spu_gang(void)
 
 	kref_init(&gang->kref);
 	mutex_init(&gang->mutex);
+	mutex_init(&gang->aff_mutex);
 	INIT_LIST_HEAD(&gang->list);
+	INIT_LIST_HEAD(&gang->aff_list_head);
 
 out:
 	return gang;
@@ -73,6 +75,8 @@ void spu_gang_remove_ctx(struct spu_gang *gang, struct spu_context *ctx)
 {
 	mutex_lock(&gang->mutex);
 	WARN_ON(ctx->gang != gang);
+	if (!list_empty(&ctx->aff_list))
+		list_del_init(&ctx->aff_list);
 	list_del_init(&ctx->gang_list);
 	gang->contexts--;
 	mutex_unlock(&gang->mutex);
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7eb4d6cbcb74..b3d0dd118dd0 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -316,11 +316,107 @@ out:
 	return ret;
 }
 
-static int spufs_create_context(struct inode *inode,
-			struct dentry *dentry,
-			struct vfsmount *mnt, int flags, int mode)
+static struct spu_context *
+spufs_assert_affinity(unsigned int flags, struct spu_gang *gang,
+						struct file *filp)
+{
+	struct spu_context *tmp, *neighbor;
+	int count, node;
+	int aff_supp;
+
+	aff_supp = !list_empty(&(list_entry(cbe_spu_info[0].spus.next,
+					struct spu, cbe_list))->aff_list);
+
+	if (!aff_supp)
+		return ERR_PTR(-EINVAL);
+
+	if (flags & SPU_CREATE_GANG)
+		return ERR_PTR(-EINVAL);
+
+	if (flags & SPU_CREATE_AFFINITY_MEM &&
+	    gang->aff_ref_ctx &&
+	    gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM)
+		return ERR_PTR(-EEXIST);
+
+	if (gang->aff_flags & AFF_MERGED)
+		return ERR_PTR(-EBUSY);
+
+	neighbor = NULL;
+	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		if (!filp || filp->f_op != &spufs_context_fops)
+			return ERR_PTR(-EINVAL);
+
+		neighbor = get_spu_context(
+				SPUFS_I(filp->f_dentry->d_inode)->i_ctx);
+
+		if (!list_empty(&neighbor->aff_list) && !(neighbor->aff_head) &&
+		    !list_is_last(&neighbor->aff_list, &gang->aff_list_head) &&
+		    !list_entry(neighbor->aff_list.next, struct spu_context,
+		    aff_list)->aff_head)
+			return ERR_PTR(-EEXIST);
+
+		if (gang != neighbor->gang)
+			return ERR_PTR(-EINVAL);
+
+		count = 1;
+		list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
+			count++;
+		if (list_empty(&neighbor->aff_list))
+			count++;
+
+		for (node = 0; node < MAX_NUMNODES; node++) {
+			if ((cbe_spu_info[node].n_spus - atomic_read(
+				&cbe_spu_info[node].reserved_spus)) >= count)
+				break;
+		}
+
+		if (node == MAX_NUMNODES)
+			return ERR_PTR(-EEXIST);
+	}
+
+	return neighbor;
+}
+
+static void
+spufs_set_affinity(unsigned int flags, struct spu_context *ctx,
+					struct spu_context *neighbor)
+{
+	if (flags & SPU_CREATE_AFFINITY_MEM)
+		ctx->gang->aff_ref_ctx = ctx;
+
+	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		if (list_empty(&neighbor->aff_list)) {
+			list_add_tail(&neighbor->aff_list,
+				&ctx->gang->aff_list_head);
+			neighbor->aff_head = 1;
+		}
+
+		if (list_is_last(&neighbor->aff_list, &ctx->gang->aff_list_head)
+		    || list_entry(neighbor->aff_list.next, struct spu_context,
+							aff_list)->aff_head) {
+			list_add(&ctx->aff_list, &neighbor->aff_list);
+		} else  {
+			list_add_tail(&ctx->aff_list, &neighbor->aff_list);
+			if (neighbor->aff_head) {
+				neighbor->aff_head = 0;
+				ctx->aff_head = 1;
+			}
+		}
+
+		if (!ctx->gang->aff_ref_ctx)
+			ctx->gang->aff_ref_ctx = ctx;
+	}
+}
+
+static int
+spufs_create_context(struct inode *inode, struct dentry *dentry,
+			struct vfsmount *mnt, int flags, int mode,
+			struct file *aff_filp)
 {
 	int ret;
+	int affinity;
+	struct spu_gang *gang;
+	struct spu_context *neighbor;
 
 	ret = -EPERM;
 	if ((flags & SPU_CREATE_NOSCHED) &&
@@ -336,9 +432,29 @@ static int spufs_create_context(struct inode *inode,
 	if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader)
 		goto out_unlock;
 
+	gang = NULL;
+	neighbor = NULL;
+	affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU);
+	if (affinity) {
+		gang = SPUFS_I(inode)->i_gang;
+		ret = -EINVAL;
+		if (!gang)
+			goto out_unlock;
+		mutex_lock(&gang->aff_mutex);
+		neighbor = spufs_assert_affinity(flags, gang, aff_filp);
+		if (IS_ERR(neighbor)) {
+			ret = PTR_ERR(neighbor);
+			goto out_aff_unlock;
+		}
+	}
+
 	ret = spufs_mkdir(inode, dentry, flags, mode & S_IRWXUGO);
 	if (ret)
-		goto out_unlock;
+		goto out_aff_unlock;
+
+	if (affinity)
+		spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx,
+								neighbor);
 
 	/*
 	 * get references for dget and mntget, will be released
@@ -352,6 +468,9 @@ static int spufs_create_context(struct inode *inode,
 		goto out;
 	}
 
+out_aff_unlock:
+	if (affinity)
+		mutex_unlock(&gang->aff_mutex);
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
@@ -450,7 +569,8 @@ out:
 
 static struct file_system_type spufs_type;
 
-long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
+long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode,
+							struct file *filp)
 {
 	struct dentry *dentry;
 	int ret;
@@ -487,7 +607,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
 					dentry, nd->mnt, mode);
 	else
 		return spufs_create_context(nd->dentry->d_inode,
-					dentry, nd->mnt, flags, mode);
+					dentry, nd->mnt, flags, mode, filp);
 
 out_dput:
 	dput(dentry);
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 03e8315f6f9e..36da17987e9c 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -109,6 +109,9 @@ struct spu_context {
 		unsigned long long class2_intr_base; /* # at last ctx switch */
 		unsigned long long libassist;
 	} stats;
+
+	struct list_head aff_list;
+	int aff_head;
 };
 
 struct spu_gang {
@@ -116,8 +119,17 @@ struct spu_gang {
 	struct mutex mutex;
 	struct kref kref;
 	int contexts;
+
+	struct spu_context *aff_ref_ctx;
+	struct list_head aff_list_head;
+	struct mutex aff_mutex;
+	int aff_flags;
 };
 
+/* Flag bits for spu_gang aff_flags */
+#define AFF_OFFSETS_SET		1
+#define AFF_MERGED		2
+
 struct mfc_dma_command {
 	int32_t pad;	/* reserved */
 	uint32_t lsa;	/* local storage address */
@@ -182,8 +194,8 @@ extern struct tree_descr spufs_dir_nosched_contents[];
 
 /* system call implementation */
 long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status);
-long spufs_create(struct nameidata *nd,
-			 unsigned int flags, mode_t mode);
+long spufs_create(struct nameidata *nd, unsigned int flags,
+			mode_t mode, struct file *filp);
 extern const struct file_operations spufs_context_fops;
 
 /* gang management */
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 13a383c67cae..43f0fb88abbc 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -76,8 +76,8 @@ asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
 }
 #endif
 
-asmlinkage long sys_spu_create(const char __user *pathname,
-					unsigned int flags, mode_t mode)
+asmlinkage long do_spu_create(const char __user *pathname, unsigned int flags,
+				mode_t mode, struct file *neighbor)
 {
 	char *tmp;
 	int ret;
@@ -90,7 +90,7 @@ asmlinkage long sys_spu_create(const char __user *pathname,
 		ret = path_lookup(tmp, LOOKUP_PARENT|
 				LOOKUP_OPEN|LOOKUP_CREATE, &nd);
 		if (!ret) {
-			ret = spufs_create(&nd, flags, mode);
+			ret = spufs_create(&nd, flags, mode, neighbor);
 			path_release(&nd);
 		}
 		putname(tmp);
@@ -99,8 +99,32 @@ asmlinkage long sys_spu_create(const char __user *pathname,
 	return ret;
 }
 
+#ifndef MODULE
+asmlinkage long sys_spu_create(const char __user *pathname, unsigned int flags,
+				mode_t mode, int neighbor_fd)
+{
+	int fput_needed;
+	struct file *neighbor;
+	long ret;
+
+	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		ret = -EBADF;
+		neighbor = fget_light(neighbor_fd, &fput_needed);
+		if (neighbor) {
+			ret = do_spu_create(pathname, flags, mode, neighbor);
+			fput_light(neighbor, fput_needed);
+		}
+	}
+	else {
+		ret = do_spu_create(pathname, flags, mode, NULL);
+	}
+
+	return ret;
+}
+#endif
+
 struct spufs_calls spufs_calls = {
-	.create_thread = sys_spu_create,
+	.create_thread = do_spu_create,
 	.spu_run = do_spu_run,
 	.owner = THIS_MODULE,
 };
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 18e558bef98e..24f352da2869 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -196,6 +196,7 @@ extern struct cbe_spu_info cbe_spu_info[];
 
 struct spu *spu_alloc(void);
 struct spu *spu_alloc_node(int node);
+struct spu *spu_alloc_spu(struct spu *spu);
 void spu_free(struct spu *spu);
 int spu_irq_class_0_bottom(struct spu *spu);
 int spu_irq_class_1_bottom(struct spu *spu);
@@ -227,7 +228,8 @@ extern long spu_sys_callback(struct spu_syscall_block *s);
 struct file;
 extern struct spufs_calls {
 	asmlinkage long (*create_thread)(const char __user *name,
-					unsigned int flags, mode_t mode);
+					unsigned int flags, mode_t mode,
+					struct file *neighbor);
 	asmlinkage long (*spu_run)(struct file *filp, __u32 __user *unpc,
 						__u32 __user *ustatus);
 	struct module *owner;
@@ -254,8 +256,10 @@ struct spu_coredump_calls {
 #define SPU_CREATE_GANG			0x0002
 #define SPU_CREATE_NOSCHED		0x0004
 #define SPU_CREATE_ISOLATE		0x0008
+#define SPU_CREATE_AFFINITY_SPU		0x0010
+#define SPU_CREATE_AFFINITY_MEM		0x0020
 
-#define SPU_CREATE_FLAG_ALL		0x000f /* mask of all valid flags */
+#define SPU_CREATE_FLAG_ALL		0x003f /* mask of all valid flags */
 
 
 #ifdef CONFIG_SPU_FS_MODULE
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7a8b1e3322e0..61def7c8fbb3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -549,7 +549,7 @@ asmlinkage long sys_inotify_rm_watch(int fd, u32 wd);
 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
 				 __u32 __user *ustatus);
 asmlinkage long sys_spu_create(const char __user *name,
-		unsigned int flags, mode_t mode);
+		unsigned int flags, mode_t mode, int fd);
 
 asmlinkage long sys_mknodat(int dfd, const char __user * filename, int mode,
 			    unsigned dev);
-- 
cgit v1.3-8-gc7d7


From 1474855d0878cced6f39f51f3c2bd7428b44cb1e Mon Sep 17 00:00:00 2001
From: Bob Nelson <rrnelson@linux.vnet.ibm.com>
Date: Fri, 20 Jul 2007 21:39:53 +0200
Subject: [CELL] oprofile: add support to OProfile for profiling CELL BE SPUs

From: Maynard Johnson <mpjohn@us.ibm.com>

This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling code.
Exports spu_set_profile_private_kref and spu_get_profile_private_kref which
are used by OProfile to store private profile information in spufs data
structures.

Also incorporated several fixes from other patches (rrn).  Check pointer
returned from kzalloc.  Eliminated unnecessary cast.  Better error
handling and cleanup in the related area.  64-bit unsigned long parameter
was being demoted to 32-bit unsigned int and eventually promoted back to
unsigned long.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <mpjohn@us.ibm.com>
Signed-off-by: Bob Nelson <rrnelson@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/configs/cell_defconfig         |   3 +-
 arch/powerpc/kernel/time.c                  |   1 +
 arch/powerpc/oprofile/Kconfig               |   7 +
 arch/powerpc/oprofile/Makefile              |   4 +-
 arch/powerpc/oprofile/cell/pr_util.h        |  97 +++++
 arch/powerpc/oprofile/cell/spu_profiler.c   | 221 ++++++++++
 arch/powerpc/oprofile/cell/spu_task_sync.c  | 484 ++++++++++++++++++++++
 arch/powerpc/oprofile/cell/vma_map.c        | 287 +++++++++++++
 arch/powerpc/oprofile/common.c              |  51 ++-
 arch/powerpc/oprofile/op_model_7450.c       |  14 +-
 arch/powerpc/oprofile/op_model_cell.c       | 607 ++++++++++++++++++++++++----
 arch/powerpc/oprofile/op_model_fsl_booke.c  |  11 +-
 arch/powerpc/oprofile/op_model_pa6t.c       |  12 +-
 arch/powerpc/oprofile/op_model_power4.c     |  11 +-
 arch/powerpc/oprofile/op_model_rs64.c       |  10 +-
 arch/powerpc/platforms/cell/spufs/context.c |  20 +
 arch/powerpc/platforms/cell/spufs/sched.c   |   4 +-
 arch/powerpc/platforms/cell/spufs/spufs.h   |   2 +
 drivers/oprofile/buffer_sync.c              |   3 +-
 drivers/oprofile/event_buffer.h             |  20 +-
 drivers/oprofile/oprof.c                    |  28 ++
 include/asm-powerpc/oprofile_impl.h         |  10 +-
 include/asm-powerpc/spu.h                   |  15 +
 include/linux/dcookies.h                    |   1 +
 include/linux/elf-em.h                      |   3 +-
 include/linux/oprofile.h                    |  35 ++
 26 files changed, 1828 insertions(+), 133 deletions(-)
 create mode 100644 arch/powerpc/oprofile/cell/pr_util.h
 create mode 100644 arch/powerpc/oprofile/cell/spu_profiler.c
 create mode 100644 arch/powerpc/oprofile/cell/spu_task_sync.c
 create mode 100644 arch/powerpc/oprofile/cell/vma_map.c

(limited to 'include/linux')

diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig
index 74f83f4a4e5e..d9ac24e8de16 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -1455,7 +1455,8 @@ CONFIG_HAS_DMA=y
 # Instrumentation Support
 #
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
+CONFIG_OPROFILE_CELL=y
 # CONFIG_KPROBES is not set
 
 #
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e5df167f7824..727a6699f2f4 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -122,6 +122,7 @@ extern struct timezone sys_tz;
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
+EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
 static u64 tb_last_jiffy __cacheline_aligned_in_smp;
diff --git a/arch/powerpc/oprofile/Kconfig b/arch/powerpc/oprofile/Kconfig
index eb2dece76a54..7089e79689b9 100644
--- a/arch/powerpc/oprofile/Kconfig
+++ b/arch/powerpc/oprofile/Kconfig
@@ -15,3 +15,10 @@ config OPROFILE
 
 	  If unsure, say N.
 
+config OPROFILE_CELL
+	bool "OProfile for Cell Broadband Engine"
+	depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m)
+	default y
+	help
+	  Profiling of Cell BE SPUs requires special support enabled
+	  by this option.
diff --git a/arch/powerpc/oprofile/Makefile b/arch/powerpc/oprofile/Makefile
index 4b5f9528218c..c5f64c3bd668 100644
--- a/arch/powerpc/oprofile/Makefile
+++ b/arch/powerpc/oprofile/Makefile
@@ -11,7 +11,9 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) common.o backtrace.o
-oprofile-$(CONFIG_PPC_CELL_NATIVE) += op_model_cell.o
+oprofile-$(CONFIG_OPROFILE_CELL) += op_model_cell.o \
+		cell/spu_profiler.o cell/vma_map.o \
+		cell/spu_task_sync.o
 oprofile-$(CONFIG_PPC64) += op_model_rs64.o op_model_power4.o op_model_pa6t.o
 oprofile-$(CONFIG_FSL_BOOKE) += op_model_fsl_booke.o
 oprofile-$(CONFIG_6xx) += op_model_7450.o
diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h
new file mode 100644
index 000000000000..e5704f00c8b4
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -0,0 +1,97 @@
+ /*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef PR_UTIL_H
+#define PR_UTIL_H
+
+#include <linux/cpumask.h>
+#include <linux/oprofile.h>
+#include <asm/cell-pmu.h>
+#include <asm/spu.h>
+
+#include "../../platforms/cell/cbe_regs.h"
+
+/* Defines used for sync_start */
+#define SKIP_GENERIC_SYNC 0
+#define SYNC_START_ERROR -1
+#define DO_GENERIC_SYNC 1
+
+struct spu_overlay_info {	/* map of sections within an SPU overlay */
+	unsigned int vma;	/* SPU virtual memory address from elf */
+	unsigned int size;	/* size of section from elf */
+	unsigned int offset;	/* offset of section into elf file */
+	unsigned int buf;
+};
+
+struct vma_to_fileoffset_map {	/* map of sections within an SPU program */
+	struct vma_to_fileoffset_map *next;	/* list pointer */
+	unsigned int vma;	/* SPU virtual memory address from elf */
+	unsigned int size;	/* size of section from elf */
+	unsigned int offset;	/* offset of section into elf file */
+	unsigned int guard_ptr;
+	unsigned int guard_val;
+        /*
+	 * The guard pointer is an entry in the _ovly_buf_table,
+	 * computed using ovly.buf as the index into the table.  Since
+	 * ovly.buf values begin at '1' to reference the first (or 0th)
+	 * entry in the _ovly_buf_table, the computation subtracts 1
+	 * from ovly.buf.
+	 * The guard value is stored in the _ovly_buf_table entry and
+	 * is an index (starting at 1) back to the _ovly_table entry
+	 * that is pointing at this _ovly_buf_table entry.  So, for
+	 * example, for an overlay scenario with one overlay segment
+	 * and two overlay sections:
+	 *      - Section 1 points to the first entry of the
+	 *        _ovly_buf_table, which contains a guard value
+	 *        of '1', referencing the first (index=0) entry of
+	 *        _ovly_table.
+	 *      - Section 2 points to the second entry of the
+	 *        _ovly_buf_table, which contains a guard value
+	 *        of '2', referencing the second (index=1) entry of
+	 *        _ovly_table.
+	 */
+
+};
+
+/* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+struct vma_to_fileoffset_map *create_vma_map(const struct spu *spu,
+					     u64 objectid);
+unsigned int vma_map_lookup(struct vma_to_fileoffset_map *map,
+			    unsigned int vma, const struct spu *aSpu,
+			    int *grd_val);
+void vma_map_free(struct vma_to_fileoffset_map *map);
+
+/*
+ * Entry point for SPU profiling.
+ * cycles_reset is the SPU_CYCLES count value specified by the user.
+ */
+int start_spu_profiling(unsigned int cycles_reset);
+
+void stop_spu_profiling(void);
+
+
+/* add the necessary profiling hooks */
+int spu_sync_start(void);
+
+/* remove the hooks */
+int spu_sync_stop(void);
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int *samples,
+		     int num_samples);
+
+void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset);
+
+#endif	  /* PR_UTIL_H */
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
new file mode 100644
index 000000000000..380d7e217531
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -0,0 +1,221 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Authors: Maynard Johnson <maynardj@us.ibm.com>
+ *	    Carl Love <carll@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <asm/cell-pmu.h>
+#include "pr_util.h"
+
+#define TRACE_ARRAY_SIZE 1024
+#define SCALE_SHIFT 14
+
+static u32 *samples;
+
+static int spu_prof_running;
+static unsigned int profiling_interval;
+
+#define NUM_SPU_BITS_TRBUF 16
+#define SPUS_PER_TB_ENTRY   4
+#define SPUS_PER_NODE	     8
+
+#define SPU_PC_MASK	     0xFFFF
+
+static DEFINE_SPINLOCK(sample_array_lock);
+unsigned long sample_array_lock_flags;
+
+void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
+{
+	unsigned long ns_per_cyc;
+
+	if (!freq_khz)
+		freq_khz = ppc_proc_freq/1000;
+
+	/* To calculate a timeout in nanoseconds, the basic
+	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
+	 * To avoid floating point math, we use the scale math
+	 * technique as described in linux/jiffies.h.  We use
+	 * a scale factor of SCALE_SHIFT, which provides 4 decimal places
+	 * of precision.  This is close enough for the purpose at hand.
+	 *
+	 * The value of the timeout should be small enough that the hw
+	 * trace buffer will not get more then about 1/3 full for the
+	 * maximum user specified (the LFSR value) hw sampling frequency.
+	 * This is to ensure the trace buffer will never fill even if the
+	 * kernel thread scheduling varies under a heavy system load.
+	 */
+
+	ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
+	profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT;
+
+}
+
+/*
+ * Extract SPU PC from trace buffer entry
+ */
+static void spu_pc_extract(int cpu, int entry)
+{
+	/* the trace buffer is 128 bits */
+	u64 trace_buffer[2];
+	u64 spu_mask;
+	int spu;
+
+	spu_mask = SPU_PC_MASK;
+
+	/* Each SPU PC is 16 bits; hence, four spus in each of
+	 * the two 64-bit buffer entries that make up the
+	 * 128-bit trace_buffer entry.	Process two 64-bit values
+	 * simultaneously.
+	 * trace[0] SPU PC contents are: 0 1 2 3
+	 * trace[1] SPU PC contents are: 4 5 6 7
+	 */
+
+	cbe_read_trace_buffer(cpu, trace_buffer);
+
+	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
+		/* spu PC trace entry is upper 16 bits of the
+		 * 18 bit SPU program counter
+		 */
+		samples[spu * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[0]) << 2;
+		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry]
+			= (spu_mask & trace_buffer[1]) << 2;
+
+		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
+		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
+	}
+}
+
+static int cell_spu_pc_collection(int cpu)
+{
+	u32 trace_addr;
+	int entry;
+
+	/* process the collected SPU PC for the node */
+
+	entry = 0;
+
+	trace_addr = cbe_read_pm(cpu, trace_address);
+	while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
+		/* there is data in the trace buffer to process */
+		spu_pc_extract(cpu, entry);
+
+		entry++;
+
+		if (entry >= TRACE_ARRAY_SIZE)
+			/* spu_samples is full */
+			break;
+
+		trace_addr = cbe_read_pm(cpu, trace_address);
+	}
+
+	return entry;
+}
+
+
+static enum hrtimer_restart profile_spus(struct hrtimer *timer)
+{
+	ktime_t kt;
+	int cpu, node, k, num_samples, spu_num;
+
+	if (!spu_prof_running)
+		goto stop;
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		node = cbe_cpu_to_node(cpu);
+
+		/* There should only be one kernel thread at a time processing
+		 * the samples.	 In the very unlikely case that the processing
+		 * is taking a very long time and multiple kernel threads are
+		 * started to process the samples.  Make sure only one kernel
+		 * thread is working on the samples array at a time.  The
+		 * sample array must be loaded and then processed for a given
+		 * cpu.	 The sample array is not per cpu.
+		 */
+		spin_lock_irqsave(&sample_array_lock,
+				  sample_array_lock_flags);
+		num_samples = cell_spu_pc_collection(cpu);
+
+		if (num_samples == 0) {
+			spin_unlock_irqrestore(&sample_array_lock,
+					       sample_array_lock_flags);
+			continue;
+		}
+
+		for (k = 0; k < SPUS_PER_NODE; k++) {
+			spu_num = k + (node * SPUS_PER_NODE);
+			spu_sync_buffer(spu_num,
+					samples + (k * TRACE_ARRAY_SIZE),
+					num_samples);
+		}
+
+		spin_unlock_irqrestore(&sample_array_lock,
+				       sample_array_lock_flags);
+
+	}
+	smp_wmb();	/* insure spu event buffer updates are written */
+			/* don't want events intermingled... */
+
+	kt = ktime_set(0, profiling_interval);
+	if (!spu_prof_running)
+		goto stop;
+	hrtimer_forward(timer, timer->base->get_time(), kt);
+	return HRTIMER_RESTART;
+
+ stop:
+	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
+	return HRTIMER_NORESTART;
+}
+
+static struct hrtimer timer;
+/*
+ * Entry point for SPU profiling.
+ * NOTE:  SPU profiling is done system-wide, not per-CPU.
+ *
+ * cycles_reset is the count value specified by the user when
+ * setting up OProfile to count SPU_CYCLES.
+ */
+int start_spu_profiling(unsigned int cycles_reset)
+{
+	ktime_t kt;
+
+	pr_debug("timer resolution: %lu\n", TICK_NSEC);
+	kt = ktime_set(0, profiling_interval);
+	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer.expires = kt;
+	timer.function = profile_spus;
+
+	/* Allocate arrays for collecting SPU PC samples */
+	samples = kzalloc(SPUS_PER_NODE *
+			  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);
+
+	if (!samples)
+		return -ENOMEM;
+
+	spu_prof_running = 1;
+	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+
+	return 0;
+}
+
+void stop_spu_profiling(void)
+{
+	spu_prof_running = 0;
+	hrtimer_cancel(&timer);
+	kfree(samples);
+	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
+}
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
new file mode 100644
index 000000000000..133665754a75
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -0,0 +1,484 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The purpose of this file is to handle SPU event task switching
+ * and to record SPU context information into the OProfile
+ * event buffer.
+ *
+ * Additionally, the spu_sync_buffer function is provided as a helper
+ * for recoding actual SPU program counter samples to the event buffer.
+ */
+#include <linux/dcookies.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/numa.h>
+#include <linux/oprofile.h>
+#include <linux/spinlock.h>
+#include "pr_util.h"
+
+#define RELEASE_ALL 9999
+
+static DEFINE_SPINLOCK(buffer_lock);
+static DEFINE_SPINLOCK(cache_lock);
+static int num_spu_nodes;
+int spu_prof_num_nodes;
+int last_guard_val[MAX_NUMNODES * 8];
+
+/* Container for caching information about an active SPU task. */
+struct cached_info {
+	struct vma_to_fileoffset_map *map;
+	struct spu *the_spu;	/* needed to access pointer to local_store */
+	struct kref cache_ref;
+};
+
+static struct cached_info *spu_info[MAX_NUMNODES * 8];
+
+static void destroy_cached_info(struct kref *kref)
+{
+	struct cached_info *info;
+
+	info = container_of(kref, struct cached_info, cache_ref);
+	vma_map_free(info->map);
+	kfree(info);
+	module_put(THIS_MODULE);
+}
+
+/* Return the cached_info for the passed SPU number.
+ * ATTENTION:  Callers are responsible for obtaining the
+ *	       cache_lock if needed prior to invoking this function.
+ */
+static struct cached_info *get_cached_info(struct spu *the_spu, int spu_num)
+{
+	struct kref *ref;
+	struct cached_info *ret_info;
+
+	if (spu_num >= num_spu_nodes) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Invalid index %d into spu info cache\n",
+		       __FUNCTION__, __LINE__, spu_num);
+		ret_info = NULL;
+		goto out;
+	}
+	if (!spu_info[spu_num] && the_spu) {
+		ref = spu_get_profile_private_kref(the_spu->ctx);
+		if (ref) {
+			spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref);
+			kref_get(&spu_info[spu_num]->cache_ref);
+		}
+	}
+
+	ret_info = spu_info[spu_num];
+ out:
+	return ret_info;
+}
+
+
+/* Looks for cached info for the passed spu.  If not found, the
+ * cached info is created for the passed spu.
+ * Returns 0 for success; otherwise, -1 for error.
+ */
+static int
+prepare_cached_spu_info(struct spu *spu, unsigned long objectId)
+{
+	unsigned long flags;
+	struct vma_to_fileoffset_map *new_map;
+	int retval = 0;
+	struct cached_info *info;
+
+	/* We won't bother getting cache_lock here since
+	 * don't do anything with the cached_info that's returned.
+	 */
+	info = get_cached_info(spu, spu->number);
+
+	if (info) {
+		pr_debug("Found cached SPU info.\n");
+		goto out;
+	}
+
+	/* Create cached_info and set spu_info[spu->number] to point to it.
+	 * spu->number is a system-wide value, not a per-node value.
+	 */
+	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
+		goto err_alloc;
+	}
+	new_map = create_vma_map(spu, objectId);
+	if (!new_map) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: create vma_map failed\n",
+		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
+		goto err_alloc;
+	}
+
+	pr_debug("Created vma_map\n");
+	info->map = new_map;
+	info->the_spu = spu;
+	kref_init(&info->cache_ref);
+	spin_lock_irqsave(&cache_lock, flags);
+	spu_info[spu->number] = info;
+	/* Increment count before passing off ref to SPUFS. */
+	kref_get(&info->cache_ref);
+
+	/* We increment the module refcount here since SPUFS is
+	 * responsible for the final destruction of the cached_info,
+	 * and it must be able to access the destroy_cached_info()
+	 * function defined in the OProfile module.  We decrement
+	 * the module refcount in destroy_cached_info.
+	 */
+	try_module_get(THIS_MODULE);
+	spu_set_profile_private_kref(spu->ctx, &info->cache_ref,
+				destroy_cached_info);
+	spin_unlock_irqrestore(&cache_lock, flags);
+	goto out;
+
+err_alloc:
+	kfree(info);
+out:
+	return retval;
+}
+
+/*
+ * NOTE:  The caller is responsible for locking the
+ *	  cache_lock prior to calling this function.
+ */
+static int release_cached_info(int spu_index)
+{
+	int index, end;
+
+	if (spu_index == RELEASE_ALL) {
+		end = num_spu_nodes;
+		index = 0;
+	} else {
+		if (spu_index >= num_spu_nodes) {
+			printk(KERN_ERR "SPU_PROF: "
+				"%s, line %d: "
+				"Invalid index %d into spu info cache\n",
+				__FUNCTION__, __LINE__, spu_index);
+			goto out;
+		}
+		end = spu_index + 1;
+		index = spu_index;
+	}
+	for (; index < end; index++) {
+		if (spu_info[index]) {
+			kref_put(&spu_info[index]->cache_ref,
+				 destroy_cached_info);
+			spu_info[index] = NULL;
+		}
+	}
+
+out:
+	return 0;
+}
+
+/* The source code for fast_get_dcookie was "borrowed"
+ * from drivers/oprofile/buffer_sync.c.
+ */
+
+/* Optimisation. We can manage without taking the dcookie sem
+ * because we cannot reach this code without at least one
+ * dcookie user still being registered (namely, the reader
+ * of the event buffer).
+ */
+static inline unsigned long fast_get_dcookie(struct dentry *dentry,
+					     struct vfsmount *vfsmnt)
+{
+	unsigned long cookie;
+
+	if (dentry->d_cookie)
+		return (unsigned long)dentry;
+	get_dcookie(dentry, vfsmnt, &cookie);
+	return cookie;
+}
+
+/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+ * which corresponds loosely to "application name". Also, determine
+ * the offset for the SPU ELF object.  If computed offset is
+ * non-zero, it implies an embedded SPU object; otherwise, it's a
+ * separate SPU binary, in which case we retrieve it's dcookie.
+ * For the embedded case, we must determine if SPU ELF is embedded
+ * in the executable application or another file (i.e., shared lib).
+ * If embedded in a shared lib, we must get the dcookie and return
+ * that to the caller.
+ */
+static unsigned long
+get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
+			    unsigned long *spu_bin_dcookie,
+			    unsigned long spu_ref)
+{
+	unsigned long app_cookie = 0;
+	unsigned int my_offset = 0;
+	struct file *app = NULL;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = spu->mm;
+
+	if (!mm)
+		goto out;
+
+	down_read(&mm->mmap_sem);
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (!vma->vm_file)
+			continue;
+		if (!(vma->vm_flags & VM_EXECUTABLE))
+			continue;
+		app_cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+					  vma->vm_file->f_vfsmnt);
+		pr_debug("got dcookie for %s\n",
+			 vma->vm_file->f_dentry->d_name.name);
+		app = vma->vm_file;
+		break;
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref)
+			continue;
+		my_offset = spu_ref - vma->vm_start;
+		if (!vma->vm_file)
+			goto fail_no_image_cookie;
+
+		pr_debug("Found spu ELF at %X(object-id:%lx) for file %s\n",
+			 my_offset, spu_ref,
+			 vma->vm_file->f_dentry->d_name.name);
+		*offsetp = my_offset;
+		break;
+	}
+
+	*spu_bin_dcookie = fast_get_dcookie(vma->vm_file->f_dentry,
+						 vma->vm_file->f_vfsmnt);
+	pr_debug("got dcookie for %s\n", vma->vm_file->f_dentry->d_name.name);
+
+	up_read(&mm->mmap_sem);
+
+out:
+	return app_cookie;
+
+fail_no_image_cookie:
+	up_read(&mm->mmap_sem);
+
+	printk(KERN_ERR "SPU_PROF: "
+		"%s, line %d: Cannot find dcookie for SPU binary\n",
+		__FUNCTION__, __LINE__);
+	goto out;
+}
+
+
+
+/* This function finds or creates cached context information for the
+ * passed SPU and records SPU context information into the OProfile
+ * event buffer.
+ */
+static int process_context_switch(struct spu *spu, unsigned long objectId)
+{
+	unsigned long flags;
+	int retval;
+	unsigned int offset = 0;
+	unsigned long spu_cookie = 0, app_dcookie;
+
+	retval = prepare_cached_spu_info(spu, objectId);
+	if (retval)
+		goto out;
+
+	/* Get dcookie first because a mutex_lock is taken in that
+	 * code path, so interrupts must not be disabled.
+	 */
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId);
+	if (!app_dcookie || !spu_cookie) {
+		retval  = -ENOENT;
+		goto out;
+	}
+
+	/* Record context info in event buffer */
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_CTX_SWITCH_CODE);
+	add_event_entry(spu->number);
+	add_event_entry(spu->pid);
+	add_event_entry(spu->tgid);
+	add_event_entry(app_dcookie);
+	add_event_entry(spu_cookie);
+	add_event_entry(offset);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+	smp_wmb();	/* insure spu event buffer updates are written */
+			/* don't want entries intermingled... */
+out:
+	return retval;
+}
+
+/*
+ * This function is invoked on either a bind_context or unbind_context.
+ * If called for an unbind_context, the val arg is 0; otherwise,
+ * it is the object-id value for the spu context.
+ * The data arg is of type 'struct spu *'.
+ */
+static int spu_active_notify(struct notifier_block *self, unsigned long val,
+				void *data)
+{
+	int retval;
+	unsigned long flags;
+	struct spu *the_spu = data;
+
+	pr_debug("SPU event notification arrived\n");
+	if (!val) {
+		spin_lock_irqsave(&cache_lock, flags);
+		retval = release_cached_info(the_spu->number);
+		spin_unlock_irqrestore(&cache_lock, flags);
+	} else {
+		retval = process_context_switch(the_spu, val);
+	}
+	return retval;
+}
+
+static struct notifier_block spu_active = {
+	.notifier_call = spu_active_notify,
+};
+
+static int number_of_online_nodes(void)
+{
+        u32 cpu; u32 tmp;
+        int nodes = 0;
+        for_each_online_cpu(cpu) {
+                tmp = cbe_cpu_to_node(cpu) + 1;
+                if (tmp > nodes)
+                        nodes++;
+        }
+        return nodes;
+}
+
+/* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+ *
+ * NOTE: When profiling SPUs, we must ensure that only
+ * spu_sync_start is invoked and not the generic sync_start
+ * in drivers/oprofile/oprof.c.	 A return value of
+ * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
+ * accomplish this.
+ */
+int spu_sync_start(void)
+{
+	int k;
+	int ret = SKIP_GENERIC_SYNC;
+	int register_ret;
+	unsigned long flags = 0;
+
+	spu_prof_num_nodes = number_of_online_nodes();
+	num_spu_nodes = spu_prof_num_nodes * 8;
+
+	spin_lock_irqsave(&buffer_lock, flags);
+	add_event_entry(ESCAPE_CODE);
+	add_event_entry(SPU_PROFILING_CODE);
+	add_event_entry(num_spu_nodes);
+	spin_unlock_irqrestore(&buffer_lock, flags);
+
+	/* Register for SPU events  */
+	register_ret = spu_switch_event_register(&spu_active);
+	if (register_ret) {
+		ret = SYNC_START_ERROR;
+		goto out;
+	}
+
+	for (k = 0; k < (MAX_NUMNODES * 8); k++)
+		last_guard_val[k] = 0;
+	pr_debug("spu_sync_start -- running.\n");
+out:
+	return ret;
+}
+
+/* Record SPU program counter samples to the oprofile event buffer. */
+void spu_sync_buffer(int spu_num, unsigned int *samples,
+		     int num_samples)
+{
+	unsigned long long file_offset;
+	unsigned long flags;
+	int i;
+	struct vma_to_fileoffset_map *map;
+	struct spu *the_spu;
+	unsigned long long spu_num_ll = spu_num;
+	unsigned long long spu_num_shifted = spu_num_ll << 32;
+	struct cached_info *c_info;
+
+	/* We need to obtain the cache_lock here because it's
+	 * possible that after getting the cached_info, the SPU job
+	 * corresponding to this cached_info may end, thus resulting
+	 * in the destruction of the cached_info.
+	 */
+	spin_lock_irqsave(&cache_lock, flags);
+	c_info = get_cached_info(NULL, spu_num);
+	if (!c_info) {
+		/* This legitimately happens when the SPU task ends before all
+		 * samples are recorded.
+		 * No big deal -- so we just drop a few samples.
+		 */
+		pr_debug("SPU_PROF: No cached SPU contex "
+			  "for SPU #%d. Dropping samples.\n", spu_num);
+		goto out;
+	}
+
+	map = c_info->map;
+	the_spu = c_info->the_spu;
+	spin_lock(&buffer_lock);
+	for (i = 0; i < num_samples; i++) {
+		unsigned int sample = *(samples+i);
+		int grd_val = 0;
+		file_offset = 0;
+		if (sample == 0)
+			continue;
+		file_offset = vma_map_lookup( map, sample, the_spu, &grd_val);
+
+		/* If overlays are used by this SPU application, the guard
+		 * value is non-zero, indicating which overlay section is in
+		 * use.	 We need to discard samples taken during the time
+		 * period which an overlay occurs (i.e., guard value changes).
+		 */
+		if (grd_val && grd_val != last_guard_val[spu_num]) {
+			last_guard_val[spu_num] = grd_val;
+			/* Drop the rest of the samples. */
+			break;
+		}
+
+		add_event_entry(file_offset | spu_num_shifted);
+	}
+	spin_unlock(&buffer_lock);
+out:
+	spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+
+int spu_sync_stop(void)
+{
+	unsigned long flags = 0;
+	int ret = spu_switch_event_unregister(&spu_active);
+	if (ret) {
+		printk(KERN_ERR "SPU_PROF: "
+			"%s, line %d: spu_switch_event_unregister returned %d\n",
+			__FUNCTION__, __LINE__, ret);
+		goto out;
+	}
+
+	spin_lock_irqsave(&cache_lock, flags);
+	ret = release_cached_info(RELEASE_ALL);
+	spin_unlock_irqrestore(&cache_lock, flags);
+out:
+	pr_debug("spu_sync_stop -- done.\n");
+	return ret;
+}
+
+
diff --git a/arch/powerpc/oprofile/cell/vma_map.c b/arch/powerpc/oprofile/cell/vma_map.c
new file mode 100644
index 000000000000..76ec1d16aef7
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/vma_map.c
@@ -0,0 +1,287 @@
+/*
+ * Cell Broadband Engine OProfile Support
+ *
+ * (C) Copyright IBM Corporation 2006
+ *
+ * Author: Maynard Johnson <maynardj@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* The code in this source file is responsible for generating
+ * vma-to-fileOffset maps for both overlay and non-overlay SPU
+ * applications.
+ */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include "pr_util.h"
+
+
+void vma_map_free(struct vma_to_fileoffset_map *map)
+{
+	while (map) {
+		struct vma_to_fileoffset_map *next = map->next;
+		kfree(map);
+		map = next;
+	}
+}
+
+unsigned int
+vma_map_lookup(struct vma_to_fileoffset_map *map, unsigned int vma,
+	       const struct spu *aSpu, int *grd_val)
+{
+	/*
+	 * Default the offset to the physical address + a flag value.
+	 * Addresses of dynamically generated code can't be found in the vma
+	 * map.  For those addresses the flagged value will be sent on to
+	 * the user space tools so they can be reported rather than just
+	 * thrown away.
+	 */
+	u32 offset = 0x10000000 + vma;
+	u32 ovly_grd;
+
+	for (; map; map = map->next) {
+		if (vma < map->vma || vma >= map->vma + map->size)
+			continue;
+
+		if (map->guard_ptr) {
+			ovly_grd = *(u32 *)(aSpu->local_store + map->guard_ptr);
+			if (ovly_grd != map->guard_val)
+				continue;
+			*grd_val = ovly_grd;
+		}
+		offset = vma - map->vma + map->offset;
+		break;
+	}
+
+	return offset;
+}
+
+static struct vma_to_fileoffset_map *
+vma_map_add(struct vma_to_fileoffset_map *map, unsigned int vma,
+	    unsigned int size, unsigned int offset, unsigned int guard_ptr,
+	    unsigned int guard_val)
+{
+	struct vma_to_fileoffset_map *new =
+		kzalloc(sizeof(struct vma_to_fileoffset_map), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_ERR "SPU_PROF: %s, line %d: malloc failed\n",
+		       __FUNCTION__, __LINE__);
+		vma_map_free(map);
+		return NULL;
+	}
+
+	new->next = map;
+	new->vma = vma;
+	new->size = size;
+	new->offset = offset;
+	new->guard_ptr = guard_ptr;
+	new->guard_val = guard_val;
+
+	return new;
+}
+
+
+/* Parse SPE ELF header and generate a list of vma_maps.
+ * A pointer to the first vma_map in the generated list
+ * of vma_maps is returned.  */
+struct vma_to_fileoffset_map *create_vma_map(const struct spu *aSpu,
+					     unsigned long spu_elf_start)
+{
+	static const unsigned char expected[EI_PAD] = {
+		[EI_MAG0] = ELFMAG0,
+		[EI_MAG1] = ELFMAG1,
+		[EI_MAG2] = ELFMAG2,
+		[EI_MAG3] = ELFMAG3,
+		[EI_CLASS] = ELFCLASS32,
+		[EI_DATA] = ELFDATA2MSB,
+		[EI_VERSION] = EV_CURRENT,
+		[EI_OSABI] = ELFOSABI_NONE
+	};
+
+	int grd_val;
+	struct vma_to_fileoffset_map *map = NULL;
+	struct spu_overlay_info ovly;
+	unsigned int overlay_tbl_offset = -1;
+	unsigned long phdr_start, shdr_start;
+	Elf32_Ehdr ehdr;
+	Elf32_Phdr phdr;
+	Elf32_Shdr shdr, shdr_str;
+	Elf32_Sym sym;
+	int i, j;
+	char name[32];
+
+	unsigned int ovly_table_sym = 0;
+	unsigned int ovly_buf_table_sym = 0;
+	unsigned int ovly_table_end_sym = 0;
+	unsigned int ovly_buf_table_end_sym = 0;
+	unsigned long ovly_table;
+	unsigned int n_ovlys;
+
+	/* Get and validate ELF header.	 */
+
+	if (copy_from_user(&ehdr, (void *) spu_elf_start, sizeof (ehdr)))
+		goto fail;
+
+	if (memcmp(ehdr.e_ident, expected, EI_PAD) != 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_ident parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_machine != EM_SPU) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_machine parsing SPU ELF\n",
+		       __FUNCTION__,  __LINE__);
+		goto fail;
+	}
+	if (ehdr.e_type != ET_EXEC) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Unexpected e_type parsing SPU ELF\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	phdr_start = spu_elf_start + ehdr.e_phoff;
+	shdr_start = spu_elf_start + ehdr.e_shoff;
+
+	/* Traverse program headers.  */
+	for (i = 0; i < ehdr.e_phnum; i++) {
+		if (copy_from_user(&phdr,
+				   (void *) (phdr_start + i * sizeof(phdr)),
+				   sizeof(phdr)))
+			goto fail;
+
+		if (phdr.p_type != PT_LOAD)
+			continue;
+		if (phdr.p_flags & (1 << 27))
+			continue;
+
+		map = vma_map_add(map, phdr.p_vaddr, phdr.p_memsz,
+				  phdr.p_offset, 0, 0);
+		if (!map)
+			goto fail;
+	}
+
+	pr_debug("SPU_PROF: Created non-overlay maps\n");
+	/* Traverse section table and search for overlay-related symbols.  */
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		if (copy_from_user(&shdr,
+				   (void *) (shdr_start + i * sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr.sh_type != SHT_SYMTAB)
+			continue;
+		if (shdr.sh_entsize != sizeof (sym))
+			continue;
+
+		if (copy_from_user(&shdr_str,
+				   (void *) (shdr_start + shdr.sh_link *
+					     sizeof(shdr)),
+				   sizeof(shdr)))
+			goto fail;
+
+		if (shdr_str.sh_type != SHT_STRTAB)
+			goto fail;;
+
+		for (j = 0; j < shdr.sh_size / sizeof (sym); j++) {
+			if (copy_from_user(&sym, (void *) (spu_elf_start +
+						       shdr.sh_offset + j *
+							   sizeof (sym)),
+					   sizeof (sym)))
+				goto fail;
+
+			if (copy_from_user(name, (void *)
+					   (spu_elf_start + shdr_str.sh_offset +
+					    sym.st_name),
+					   20))
+				goto fail;
+
+			if (memcmp(name, "_ovly_table", 12) == 0)
+				ovly_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table", 16) == 0)
+				ovly_buf_table_sym = sym.st_value;
+			if (memcmp(name, "_ovly_table_end", 16) == 0)
+				ovly_table_end_sym = sym.st_value;
+			if (memcmp(name, "_ovly_buf_table_end", 20) == 0)
+				ovly_buf_table_end_sym = sym.st_value;
+		}
+	}
+
+	/* If we don't have overlays, we're done.  */
+	if (ovly_table_sym == 0 || ovly_buf_table_sym == 0
+	    || ovly_table_end_sym == 0 || ovly_buf_table_end_sym == 0) {
+		pr_debug("SPU_PROF: No overlay table found\n");
+		goto out;
+	} else {
+		pr_debug("SPU_PROF: Overlay table found\n");
+	}
+
+	/* The _ovly_table symbol represents a table with one entry
+	 * per overlay section.	 The _ovly_buf_table symbol represents
+	 * a table with one entry per overlay region.
+	 * The struct spu_overlay_info gives the structure of the _ovly_table
+	 * entries.  The structure of _ovly_table_buf is simply one
+	 * u32 word per entry.
+	 */
+	overlay_tbl_offset = vma_map_lookup(map, ovly_table_sym,
+					    aSpu, &grd_val);
+	if (overlay_tbl_offset < 0) {
+		printk(KERN_ERR "SPU_PROF: "
+		       "%s, line %d: Error finding SPU overlay table\n",
+		       __FUNCTION__, __LINE__);
+		goto fail;
+	}
+	ovly_table = spu_elf_start + overlay_tbl_offset;
+
+	n_ovlys = (ovly_table_end_sym -
+		   ovly_table_sym) / sizeof (ovly);
+
+	/* Traverse overlay table.  */
+	for (i = 0; i < n_ovlys; i++) {
+		if (copy_from_user(&ovly, (void *)
+				   (ovly_table + i * sizeof (ovly)),
+				   sizeof (ovly)))
+			goto fail;
+
+		/* The ovly.vma/size/offset arguments are analogous to the same
+		 * arguments used above for non-overlay maps.  The final two
+		 * args are referred to as the guard pointer and the guard
+		 * value.
+		 * The guard pointer is an entry in the _ovly_buf_table,
+		 * computed using ovly.buf as the index into the table.	 Since
+		 * ovly.buf values begin at '1' to reference the first (or 0th)
+		 * entry in the _ovly_buf_table, the computation subtracts 1
+		 * from ovly.buf.
+		 * The guard value is stored in the _ovly_buf_table entry and
+		 * is an index (starting at 1) back to the _ovly_table entry
+		 * that is pointing at this _ovly_buf_table entry.  So, for
+		 * example, for an overlay scenario with one overlay segment
+		 * and two overlay sections:
+		 *	- Section 1 points to the first entry of the
+		 *	  _ovly_buf_table, which contains a guard value
+		 *	  of '1', referencing the first (index=0) entry of
+		 *	  _ovly_table.
+		 *	- Section 2 points to the second entry of the
+		 *	  _ovly_buf_table, which contains a guard value
+		 *	  of '2', referencing the second (index=1) entry of
+		 *	  _ovly_table.
+		 */
+		map = vma_map_add(map, ovly.vma, ovly.size, ovly.offset,
+				  ovly_buf_table_sym + (ovly.buf-1) * 4, i+1);
+		if (!map)
+			goto fail;
+	}
+	goto out;
+
+ fail:
+	map = NULL;
+ out:
+	return map;
+}
diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c
index 1a7ef7e246d2..a28cce1d6c24 100644
--- a/arch/powerpc/oprofile/common.c
+++ b/arch/powerpc/oprofile/common.c
@@ -29,6 +29,8 @@ static struct op_powerpc_model *model;
 static struct op_counter_config ctr[OP_MAX_COUNTER];
 static struct op_system_config sys;
 
+static int op_per_cpu_rc;
+
 static void op_handle_interrupt(struct pt_regs *regs)
 {
 	model->handle_interrupt(regs, ctr);
@@ -36,25 +38,41 @@ static void op_handle_interrupt(struct pt_regs *regs)
 
 static void op_powerpc_cpu_setup(void *dummy)
 {
-	model->cpu_setup(ctr);
+	int ret;
+
+	ret = model->cpu_setup(ctr);
+
+	if (ret != 0)
+		op_per_cpu_rc = ret;
 }
 
 static int op_powerpc_setup(void)
 {
 	int err;
 
+	op_per_cpu_rc = 0;
+
 	/* Grab the hardware */
 	err = reserve_pmc_hardware(op_handle_interrupt);
 	if (err)
 		return err;
 
 	/* Pre-compute the values to stuff in the hardware registers.  */
-	model->reg_setup(ctr, &sys, model->num_counters);
+	op_per_cpu_rc = model->reg_setup(ctr, &sys, model->num_counters);
 
-	/* Configure the registers on all cpus.  */
+	if (op_per_cpu_rc)
+		goto out;
+
+	/* Configure the registers on all cpus.	 If an error occurs on one
+	 * of the cpus, op_per_cpu_rc will be set to the error */
 	on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1);
 
-	return 0;
+out:	if (op_per_cpu_rc) {
+		/* error on setup release the performance counter hardware */
+		release_pmc_hardware();
+	}
+
+	return op_per_cpu_rc;
 }
 
 static void op_powerpc_shutdown(void)
@@ -64,16 +82,29 @@ static void op_powerpc_shutdown(void)
 
 static void op_powerpc_cpu_start(void *dummy)
 {
-	model->start(ctr);
+	/* If any of the cpus have return an error, set the
+	 * global flag to the error so it can be returned
+	 * to the generic OProfile caller.
+	 */
+	int ret;
+
+	ret = model->start(ctr);
+	if (ret != 0)
+		op_per_cpu_rc = ret;
 }
 
 static int op_powerpc_start(void)
 {
+	op_per_cpu_rc = 0;
+
 	if (model->global_start)
-		model->global_start(ctr);
-	if (model->start)
+		return model->global_start(ctr);
+	if (model->start) {
 		on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1);
-	return 0;
+		return op_per_cpu_rc;
+	}
+	return -EIO; /* No start function is defined for this
+			power architecture */
 }
 
 static inline void op_powerpc_cpu_stop(void *dummy)
@@ -147,11 +178,13 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 
 	switch (cur_cpu_spec->oprofile_type) {
 #ifdef CONFIG_PPC64
-#ifdef CONFIG_PPC_CELL_NATIVE
+#ifdef CONFIG_OPROFILE_CELL
 		case PPC_OPROFILE_CELL:
 			if (firmware_has_feature(FW_FEATURE_LPAR))
 				return -ENODEV;
 			model = &op_model_cell;
+			ops->sync_start = model->sync_start;
+			ops->sync_stop = model->sync_stop;
 			break;
 #endif
 		case PPC_OPROFILE_RS64:
diff --git a/arch/powerpc/oprofile/op_model_7450.c b/arch/powerpc/oprofile/op_model_7450.c
index 5d1bbaf35ccb..cc599eb8768b 100644
--- a/arch/powerpc/oprofile/op_model_7450.c
+++ b/arch/powerpc/oprofile/op_model_7450.c
@@ -81,7 +81,7 @@ static void pmc_stop_ctrs(void)
 
 /* Configures the counters on this CPU based on the global
  * settings */
-static void fsl7450_cpu_setup(struct op_counter_config *ctr)
+static int fsl7450_cpu_setup(struct op_counter_config *ctr)
 {
 	/* freeze all counters */
 	pmc_stop_ctrs();
@@ -89,12 +89,14 @@ static void fsl7450_cpu_setup(struct op_counter_config *ctr)
 	mtspr(SPRN_MMCR0, mmcr0_val);
 	mtspr(SPRN_MMCR1, mmcr1_val);
 	mtspr(SPRN_MMCR2, mmcr2_val);
+
+	return 0;
 }
 
 #define NUM_CTRS 6
 
 /* Configures the global settings for the countes on all CPUs. */
-static void fsl7450_reg_setup(struct op_counter_config *ctr,
+static int fsl7450_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -126,10 +128,12 @@ static void fsl7450_reg_setup(struct op_counter_config *ctr,
 		| mmcr1_event6(ctr[5].event);
 
 	mmcr2_val = 0;
+
+	return 0;
 }
 
 /* Sets the counters on this CPU to the chosen values, and starts them */
-static void fsl7450_start(struct op_counter_config *ctr)
+static int fsl7450_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -148,6 +152,8 @@ static void fsl7450_start(struct op_counter_config *ctr)
 	pmc_start_ctrs();
 
 	oprofile_running = 1;
+
+	return 0;
 }
 
 /* Stop the counters on this CPU */
@@ -193,7 +199,7 @@ static void fsl7450_handle_interrupt(struct pt_regs *regs,
 	/* The freeze bit was set by the interrupt. */
 	/* Clear the freeze bit, and reenable the interrupt.
 	 * The counters won't actually start until the rfi clears
-	 * the PMM bit */
+	 * the PM/M bit */
 	pmc_start_ctrs();
 }
 
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index c29293befba9..d928b54f3a0f 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -5,8 +5,8 @@
  *
  * Author: David Erb (djerb@us.ibm.com)
  * Modifications:
- *         Carl Love <carll@us.ibm.com>
- *         Maynard Johnson <maynardj@us.ibm.com>
+ *	   Carl Love <carll@us.ibm.com>
+ *	   Maynard Johnson <maynardj@us.ibm.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -38,12 +38,25 @@
 
 #include "../platforms/cell/interrupt.h"
 #include "../platforms/cell/cbe_regs.h"
+#include "cell/pr_util.h"
+
+static void cell_global_stop_spu(void);
+
+/*
+ * spu_cycle_reset is the number of cycles between samples.
+ * This variable is used for SPU profiling and should ONLY be set
+ * at the beginning of cell_reg_setup; otherwise, it's read-only.
+ */
+static unsigned int spu_cycle_reset;
+
+#define NUM_SPUS_PER_NODE    8
+#define SPU_CYCLES_EVENT_NUM 2	/*  event number for SPU_CYCLES */
 
 #define PPU_CYCLES_EVENT_NUM 1	/*  event number for CYCLES */
-#define PPU_CYCLES_GRP_NUM   1  /* special group number for identifying
-                                 * PPU_CYCLES event
-                                 */
-#define CBE_COUNT_ALL_CYCLES 0x42800000	/* PPU cycle event specifier */
+#define PPU_CYCLES_GRP_NUM   1	/* special group number for identifying
+				 * PPU_CYCLES event
+				 */
+#define CBE_COUNT_ALL_CYCLES 0x42800000 /* PPU cycle event specifier */
 
 #define NUM_THREADS 2         /* number of physical threads in
 			       * physical processor
@@ -51,6 +64,7 @@
 #define NUM_TRACE_BUS_WORDS 4
 #define NUM_INPUT_BUS_WORDS 2
 
+#define MAX_SPU_COUNT 0xFFFFFF	/* maximum 24 bit LFSR value */
 
 struct pmc_cntrl_data {
 	unsigned long vcntr;
@@ -62,11 +76,10 @@ struct pmc_cntrl_data {
 /*
  * ibm,cbe-perftools rtas parameters
  */
-
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
-	u16 sub_unit;		/* hw subunit this applies to (if applicable) */
-	short int signal_group;	/* Signal Group to Enable/Disable */
+	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
+	short int signal_group; /* Signal Group to Enable/Disable */
 	u8 bus_word;		/* Enable/Disable on this Trace/Trigger/Event
 				 * Bus Word(s) (bitmask)
 				 */
@@ -112,21 +125,42 @@ static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values);
 
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 
-/* Interpetation of hdw_thread:
+/*
+ * The CELL profiling code makes rtas calls to setup the debug bus to
+ * route the performance signals.  Additionally, SPU profiling requires
+ * a second rtas call to setup the hardware to capture the SPU PCs.
+ * The EIO error value is returned if the token lookups or the rtas
+ * call fail.  The EIO error number is the best choice of the existing
+ * error numbers.  The probability of rtas related error is very low.  But
+ * by returning EIO and printing additional information to dmsg the user
+ * will know that OProfile did not start and dmesg will tell them why.
+ * OProfile does not support returning errors on Stop.	Not a huge issue
+ * since failure to reset the debug bus or stop the SPU PC collection is
+ * not a fatel issue.  Chances are if the Stop failed, Start doesn't work
+ * either.
+ */
+
+/*
+ * Interpetation of hdw_thread:
  * 0 - even virtual cpus 0, 2, 4,...
  * 1 - odd virtual cpus 1, 3, 5, ...
+ *
+ * FIXME: this is strictly wrong, we need to clean this up in a number
+ * of places. It works for now. -arnd
  */
 static u32 hdw_thread;
 
 static u32 virt_cntr_inter_mask;
 static struct timer_list timer_virt_cntr;
 
-/* pm_signal needs to be global since it is initialized in
+/*
+ * pm_signal needs to be global since it is initialized in
  * cell_reg_setup at the time when the necessary information
  * is available.
  */
 static struct pm_signal pm_signal[NR_PHYS_CTRS];
-static int pm_rtas_token;
+static int pm_rtas_token;    /* token for debug bus setup call */
+static int spu_rtas_token;   /* token for SPU cycle profiling */
 
 static u32 reset_value[NR_PHYS_CTRS];
 static int num_counters;
@@ -147,8 +181,8 @@ rtas_ibm_cbe_perftools(int subfunc, int passthru,
 {
 	u64 paddr = __pa(address);
 
-	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc, passthru,
-			 paddr >> 32, paddr & 0xffffffff, length);
+	return rtas_call(pm_rtas_token, 5, 1, NULL, subfunc,
+			 passthru, paddr >> 32, paddr & 0xffffffff, length);
 }
 
 static void pm_rtas_reset_signals(u32 node)
@@ -156,12 +190,13 @@ static void pm_rtas_reset_signals(u32 node)
 	int ret;
 	struct pm_signal pm_signal_local;
 
-	/*  The debug bus is being set to the passthru disable state.
-	 *  However, the FW still expects atleast one legal signal routing
-	 *  entry or it will return an error on the arguments.  If we don't
-	 *  supply a valid entry, we must ignore all return values.  Ignoring
-	 *  all return values means we might miss an error we should be
-	 *  concerned about.
+	/*
+	 * The debug bus is being set to the passthru disable state.
+	 * However, the FW still expects atleast one legal signal routing
+	 * entry or it will return an error on the arguments.	If we don't
+	 * supply a valid entry, we must ignore all return values.  Ignoring
+	 * all return values means we might miss an error we should be
+	 * concerned about.
 	 */
 
 	/*  fw expects physical cpu #. */
@@ -175,18 +210,24 @@ static void pm_rtas_reset_signals(u32 node)
 				     &pm_signal_local,
 				     sizeof(struct pm_signal));
 
-	if (ret)
+	if (unlikely(ret))
+		/*
+		 * Not a fatal error. For Oprofile stop, the oprofile
+		 * functions do not support returning an error for
+		 * failure to stop OProfile.
+		 */
 		printk(KERN_WARNING "%s: rtas returned: %d\n",
 		       __FUNCTION__, ret);
 }
 
-static void pm_rtas_activate_signals(u32 node, u32 count)
+static int pm_rtas_activate_signals(u32 node, u32 count)
 {
 	int ret;
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
-	/* There is no debug setup required for the cycles event.
+	/*
+	 * There is no debug setup required for the cycles event.
 	 * Note that only events in the same group can be used.
 	 * Otherwise, there will be conflicts in correctly routing
 	 * the signals on the debug bus.  It is the responsiblity
@@ -213,10 +254,14 @@ static void pm_rtas_activate_signals(u32 node, u32 count)
 					     pm_signal_local,
 					     i * sizeof(struct pm_signal));
 
-		if (ret)
+		if (unlikely(ret)) {
 			printk(KERN_WARNING "%s: rtas returned: %d\n",
 			       __FUNCTION__, ret);
+			return -EIO;
+		}
 	}
+
+	return 0;
 }
 
 /*
@@ -260,11 +305,12 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask)
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
 
-	/* Some of the islands signal selection is based on 64 bit words.
+	/*
+	 * Some of the islands signal selection is based on 64 bit words.
 	 * The debug bus words are 32 bits, the input words to the performance
 	 * counters are defined as 32 bits.  Need to convert the 64 bit island
 	 * specification to the appropriate 32 input bit and bus word for the
-	 * performance counter event selection.  See the CELL Performance
+	 * performance counter event selection.	 See the CELL Performance
 	 * monitoring signals manual and the Perf cntr hardware descriptions
 	 * for the details.
 	 */
@@ -298,6 +344,7 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask)
 					input_bus[j] = i;
 					pm_regs.group_control |=
 					    (i << (31 - i));
+
 					break;
 				}
 			}
@@ -309,7 +356,8 @@ out:
 
 static void write_pm_cntrl(int cpu)
 {
-	/* Oprofile will use 32 bit counters, set bits 7:10 to 0
+	/*
+	 * Oprofile will use 32 bit counters, set bits 7:10 to 0
 	 * pmregs.pm_cntrl is a global
 	 */
 
@@ -326,7 +374,8 @@ static void write_pm_cntrl(int cpu)
 	if (pm_regs.pm_cntrl.freeze == 1)
 		val |= CBE_PM_FREEZE_ALL_CTRS;
 
-	/* Routine set_count_mode must be called previously to set
+	/*
+	 * Routine set_count_mode must be called previously to set
 	 * the count mode based on the user selection of user and kernel.
 	 */
 	val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
@@ -336,7 +385,8 @@ static void write_pm_cntrl(int cpu)
 static inline void
 set_count_mode(u32 kernel, u32 user)
 {
-	/* The user must specify user and kernel if they want them. If
+	/*
+	 * The user must specify user and kernel if they want them. If
 	 *  neither is specified, OProfile will count in hypervisor mode.
 	 *  pm_regs.pm_cntrl is a global
 	 */
@@ -364,7 +414,7 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl)
 
 /*
  * Oprofile is expected to collect data on all CPUs simultaneously.
- * However, there is one set of performance counters per node.  There are
+ * However, there is one set of performance counters per node.	There are
  * two hardware threads or virtual CPUs on each node.  Hence, OProfile must
  * multiplex in time the performance counter collection on the two virtual
  * CPUs.  The multiplexing of the performance counters is done by this
@@ -377,19 +427,19 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl)
  * pair of per-cpu arrays is used for storing the previous and next
  * pmc values for a given node.
  * NOTE: We use the per-cpu variable to improve cache performance.
+ *
+ * This routine will alternate loading the virtual counters for
+ * virtual CPUs
  */
 static void cell_virtual_cntr(unsigned long data)
 {
-	/* This routine will alternate loading the virtual counters for
-	 * virtual CPUs
-	 */
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
 	unsigned long flags;
 
-	/* Make sure that the interrupt_hander and
-	 * the virt counter are not both playing with
-	 * the counters on the same node.
+	/*
+	 * Make sure that the interrupt_hander and the virt counter are
+	 * not both playing with the counters on the same node.
 	 */
 
 	spin_lock_irqsave(&virt_cntr_lock, flags);
@@ -400,22 +450,25 @@ static void cell_virtual_cntr(unsigned long data)
 	hdw_thread = 1 ^ hdw_thread;
 	next_hdw_thread = hdw_thread;
 
-	for (i = 0; i < num_counters; i++)
-	/* There are some per thread events.  Must do the
+	/*
+	 * There are some per thread events.  Must do the
 	 * set event, for the thread that is being started
 	 */
+	for (i = 0; i < num_counters; i++)
 		set_pm_event(i,
 			pmc_cntrl[next_hdw_thread][i].evnts,
 			pmc_cntrl[next_hdw_thread][i].masks);
 
-	/* The following is done only once per each node, but
+	/*
+	 * The following is done only once per each node, but
 	 * we need cpu #, not node #, to pass to the cbe_xxx functions.
 	 */
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
 
-		/* stop counters, save counter values, restore counts
+		/*
+		 * stop counters, save counter values, restore counts
 		 * for previous thread
 		 */
 		cbe_disable_pm(cpu);
@@ -428,7 +481,7 @@ static void cell_virtual_cntr(unsigned long data)
 			    == 0xFFFFFFFF)
 				/* If the cntr value is 0xffffffff, we must
 				 * reset that to 0xfffffff0 when the current
-				 * thread is restarted.  This will generate a
+				 * thread is restarted.	 This will generate a
 				 * new interrupt and make sure that we never
 				 * restore the counters to the max value.  If
 				 * the counters were restored to the max value,
@@ -444,13 +497,15 @@ static void cell_virtual_cntr(unsigned long data)
 						      next_hdw_thread)[i]);
 		}
 
-		/* Switch to the other thread. Change the interrupt
+		/*
+		 * Switch to the other thread. Change the interrupt
 		 * and control regs to be scheduled on the CPU
 		 * corresponding to the thread to execute.
 		 */
 		for (i = 0; i < num_counters; i++) {
 			if (pmc_cntrl[next_hdw_thread][i].enabled) {
-				/* There are some per thread events.
+				/*
+				 * There are some per thread events.
 				 * Must do the set event, enable_cntr
 				 * for each cpu.
 				 */
@@ -482,17 +537,42 @@ static void start_virt_cntrs(void)
 }
 
 /* This function is called once for all cpus combined */
-static void
-cell_reg_setup(struct op_counter_config *ctr,
-	       struct op_system_config *sys, int num_ctrs)
+static int cell_reg_setup(struct op_counter_config *ctr,
+			struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
+	spu_cycle_reset = 0;
+
+	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
+		spu_cycle_reset = ctr[0].count;
+
+		/*
+		 * Each node will need to make the rtas call to start
+		 * and stop SPU profiling.  Get the token once and store it.
+		 */
+		spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
+
+		if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+			printk(KERN_ERR
+			       "%s: rtas token ibm,cbe-spu-perftools unknown\n",
+			       __FUNCTION__);
+			return -EIO;
+		}
+	}
 
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
+
+	/*
+	 * For all events excetp PPU CYCLEs, each node will need to make
+	 * the rtas cbe-perftools call to setup and reset the debug bus.
+	 * Make the token lookup call once and store it in the global
+	 * variable pm_rtas_token.
+	 */
+	if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) {
+		printk(KERN_ERR
+		       "%s: rtas token ibm,cbe-perftools unknown\n",
 		       __FUNCTION__);
-		goto out;
+		return -EIO;
 	}
 
 	num_counters = num_ctrs;
@@ -520,7 +600,8 @@ cell_reg_setup(struct op_counter_config *ctr,
 			per_cpu(pmc_values, j)[i] = 0;
 	}
 
-	/* Setup the thread 1 events, map the thread 0 event to the
+	/*
+	 * Setup the thread 1 events, map the thread 0 event to the
 	 * equivalent thread 1 event.
 	 */
 	for (i = 0; i < num_ctrs; ++i) {
@@ -544,9 +625,10 @@ cell_reg_setup(struct op_counter_config *ctr,
 	for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
 		input_bus[i] = 0xff;
 
-	/* Our counters count up, and "count" refers to
+	/*
+	 * Our counters count up, and "count" refers to
 	 * how much before the next interrupt, and we interrupt
-	 * on overflow.  So we calculate the starting value
+	 * on overflow.	 So we calculate the starting value
 	 * which will give us "count" until overflow.
 	 * Then we set the events on the enabled counters.
 	 */
@@ -569,28 +651,27 @@ cell_reg_setup(struct op_counter_config *ctr,
 		for (i = 0; i < num_counters; ++i) {
 			per_cpu(pmc_values, cpu)[i] = reset_value[i];
 		}
-out:
-	;
+
+	return 0;
 }
 
+
+
 /* This function is called once for each cpu */
-static void cell_cpu_setup(struct op_counter_config *cntr)
+static int cell_cpu_setup(struct op_counter_config *cntr)
 {
 	u32 cpu = smp_processor_id();
 	u32 num_enabled = 0;
 	int i;
 
+	if (spu_cycle_reset)
+		return 0;
+
 	/* There is one performance monitor per processor chip (i.e. node),
 	 * so we only need to perform this function once per node.
 	 */
 	if (cbe_get_hw_thread_id(cpu))
-		goto out;
-
-	if (pm_rtas_token == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_WARNING "%s: RTAS_UNKNOWN_SERVICE\n",
-		       __FUNCTION__);
-		goto out;
-	}
+		return 0;
 
 	/* Stop all counters */
 	cbe_disable_pm(cpu);
@@ -609,16 +690,286 @@ static void cell_cpu_setup(struct op_counter_config *cntr)
 		}
 	}
 
-	pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+	/*
+	 * The pm_rtas_activate_signals will return -EIO if the FW
+	 * call failed.
+	 */
+	return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
+}
+
+#define ENTRIES	 303
+#define MAXLFSR	 0xFFFFFF
+
+/* precomputed table of 24 bit LFSR values */
+static int initial_lfsr[] = {
+ 8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
+ 15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
+ 4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
+ 3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
+ 9341783, 1526887, 3985002, 1439429, 13923762, 7010104, 11969769, 4547026,
+ 2040072, 4025602, 3437678, 7939992, 11444177, 4496094, 9803157, 10745556,
+ 3671780, 4257846, 5662259, 13196905, 3237343, 12077182, 16222879, 7587769,
+ 14706824, 2184640, 12591135, 10420257, 7406075, 3648978, 11042541, 15906893,
+ 11914928, 4732944, 10695697, 12928164, 11980531, 4430912, 11939291, 2917017,
+ 6119256, 4172004, 9373765, 8410071, 14788383, 5047459, 5474428, 1737756,
+ 15967514, 13351758, 6691285, 8034329, 2856544, 14394753, 11310160, 12149558,
+ 7487528, 7542781, 15668898, 12525138, 12790975, 3707933, 9106617, 1965401,
+ 16219109, 12801644, 2443203, 4909502, 8762329, 3120803, 6360315, 9309720,
+ 15164599, 10844842, 4456529, 6667610, 14924259, 884312, 6234963, 3326042,
+ 15973422, 13919464, 5272099, 6414643, 3909029, 2764324, 5237926, 4774955,
+ 10445906, 4955302, 5203726, 10798229, 11443419, 2303395, 333836, 9646934,
+ 3464726, 4159182, 568492, 995747, 10318756, 13299332, 4836017, 8237783,
+ 3878992, 2581665, 11394667, 5672745, 14412947, 3159169, 9094251, 16467278,
+ 8671392, 15230076, 4843545, 7009238, 15504095, 1494895, 9627886, 14485051,
+ 8304291, 252817, 12421642, 16085736, 4774072, 2456177, 4160695, 15409741,
+ 4902868, 5793091, 13162925, 16039714, 782255, 11347835, 14884586, 366972,
+ 16308990, 11913488, 13390465, 2958444, 10340278, 1177858, 1319431, 10426302,
+ 2868597, 126119, 5784857, 5245324, 10903900, 16436004, 3389013, 1742384,
+ 14674502, 10279218, 8536112, 10364279, 6877778, 14051163, 1025130, 6072469,
+ 1988305, 8354440, 8216060, 16342977, 13112639, 3976679, 5913576, 8816697,
+ 6879995, 14043764, 3339515, 9364420, 15808858, 12261651, 2141560, 5636398,
+ 10345425, 10414756, 781725, 6155650, 4746914, 5078683, 7469001, 6799140,
+ 10156444, 9667150, 10116470, 4133858, 2121972, 1124204, 1003577, 1611214,
+ 14304602, 16221850, 13878465, 13577744, 3629235, 8772583, 10881308, 2410386,
+ 7300044, 5378855, 9301235, 12755149, 4977682, 8083074, 10327581, 6395087,
+ 9155434, 15501696, 7514362, 14520507, 15808945, 3244584, 4741962, 9658130,
+ 14336147, 8654727, 7969093, 15759799, 14029445, 5038459, 9894848, 8659300,
+ 13699287, 8834306, 10712885, 14753895, 10410465, 3373251, 309501, 9561475,
+ 5526688, 14647426, 14209836, 5339224, 207299, 14069911, 8722990, 2290950,
+ 3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
+ 6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
+ 7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
+ 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
+};
+
+/*
+ * The hardware uses an LFSR counting sequence to determine when to capture
+ * the SPU PCs.	 An LFSR sequence is like a puesdo random number sequence
+ * where each number occurs once in the sequence but the sequence is not in
+ * numerical order. The SPU PC capture is done when the LFSR sequence reaches
+ * the last value in the sequence.  Hence the user specified value N
+ * corresponds to the LFSR number that is N from the end of the sequence.
+ *
+ * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
+ * LFSR sequence is broken into four ranges.  The spacing of the precomputed
+ * values is adjusted in each range so the error between the user specifed
+ * number (N) of events between samples and the actual number of events based
+ * on the precomputed value will be les then about 6.2%.  Note, if the user
+ * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
+ * This is to prevent the loss of samples because the trace buffer is full.
+ *
+ *	   User specified N		     Step between	   Index in
+ *					 precomputed values	 precomputed
+ *								    table
+ * 0		    to	2^16-1			----		      0
+ * 2^16	    to	2^16+2^19-1		2^12		    1 to 128
+ * 2^16+2^19	    to	2^16+2^19+2^22-1	2^15		  129 to 256
+ * 2^16+2^19+2^22  to	2^24-1			2^18		  257 to 302
+ *
+ *
+ * For example, the LFSR values in the second range are computed for 2^16,
+ * 2^16+2^12, ... , 2^19-2^16, 2^19 and stored in the table at indicies
+ * 1, 2,..., 127, 128.
+ *
+ * The 24 bit LFSR value for the nth number in the sequence can be
+ * calculated using the following code:
+ *
+ * #define size 24
+ * int calculate_lfsr(int n)
+ * {
+ *	int i;
+ *	unsigned int newlfsr0;
+ *	unsigned int lfsr = 0xFFFFFF;
+ *	unsigned int howmany = n;
+ *
+ *	for (i = 2; i < howmany + 2; i++) {
+ *		newlfsr0 = (((lfsr >> (size - 1 - 0)) & 1) ^
+ *		((lfsr >> (size - 1 - 1)) & 1) ^
+ *		(((lfsr >> (size - 1 - 6)) & 1) ^
+ *		((lfsr >> (size - 1 - 23)) & 1)));
+ *
+ *		lfsr >>= 1;
+ *		lfsr = lfsr | (newlfsr0 << (size - 1));
+ *	}
+ *	return lfsr;
+ * }
+ */
+
+#define V2_16  (0x1 << 16)
+#define V2_19  (0x1 << 19)
+#define V2_22  (0x1 << 22)
+
+static int calculate_lfsr(int n)
+{
+	/*
+	 * The ranges and steps are in powers of 2 so the calculations
+	 * can be done using shifts rather then divide.
+	 */
+	int index;
+
+	if ((n >> 16) == 0)
+		index = 0;
+	else if (((n - V2_16) >> 19) == 0)
+		index = ((n - V2_16) >> 12) + 1;
+	else if (((n - V2_16 - V2_19) >> 22) == 0)
+		index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
+	else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
+		index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
+	else
+		index = ENTRIES-1;
+
+	/* make sure index is valid */
+	if ((index > ENTRIES) || (index < 0))
+		index = ENTRIES-1;
+
+	return initial_lfsr[index];
+}
+
+static int pm_rtas_activate_spu_profiling(u32 node)
+{
+	int ret, i;
+	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
+
+	/*
+	 * Set up the rtas call to configure the debug bus to
+	 * route the SPU PCs.  Setup the pm_signal for each SPU
+	 */
+	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
+		pm_signal_local[i].cpu = node;
+		pm_signal_local[i].signal_group = 41;
+		/* spu i on word (i/2) */
+		pm_signal_local[i].bus_word = 1 << i / 2;
+		/* spu i */
+		pm_signal_local[i].sub_unit = i;
+		pm_signal_local[i].bit = 63;
+	}
+
+	ret = rtas_ibm_cbe_perftools(SUBFUNC_ACTIVATE,
+				     PASSTHRU_ENABLE, pm_signal_local,
+				     (NUM_SPUS_PER_NODE
+				      * sizeof(struct pm_signal)));
+
+	if (unlikely(ret)) {
+		printk(KERN_WARNING "%s: rtas returned: %d\n",
+		       __FUNCTION__, ret);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CPU_FREQ
+static int
+oprof_cpufreq_notify(struct notifier_block *nb, unsigned long val, void *data)
+{
+	int ret = 0;
+	struct cpufreq_freqs *frq = data;
+	if ((val == CPUFREQ_PRECHANGE && frq->old < frq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && frq->old > frq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE))
+		set_spu_profiling_frequency(frq->new, spu_cycle_reset);
+	return ret;
+}
+
+static struct notifier_block cpu_freq_notifier_block = {
+	.notifier_call	= oprof_cpufreq_notify
+};
+#endif
+
+static int cell_global_start_spu(struct op_counter_config *ctr)
+{
+	int subfunc;
+	unsigned int lfsr_value;
+	int cpu;
+	int ret;
+	int rtas_error;
+	unsigned int cpu_khzfreq = 0;
+
+	/* The SPU profiling uses time-based profiling based on
+	 * cpu frequency, so if configured with the CPU_FREQ
+	 * option, we should detect frequency changes and react
+	 * accordingly.
+	 */
+#ifdef CONFIG_CPU_FREQ
+	ret = cpufreq_register_notifier(&cpu_freq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret < 0)
+		/* this is not a fatal error */
+		printk(KERN_ERR "CPU freq change registration failed: %d\n",
+		       ret);
+
+	else
+		cpu_khzfreq = cpufreq_quick_get(smp_processor_id());
+#endif
+
+	set_spu_profiling_frequency(cpu_khzfreq, spu_cycle_reset);
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		/*
+		 * Setup SPU cycle-based profiling.
+		 * Set perf_mon_control bit 0 to a zero before
+		 * enabling spu collection hardware.
+		 */
+		cbe_write_pm(cpu, pm_control, 0);
+
+		if (spu_cycle_reset > MAX_SPU_COUNT)
+			/* use largest possible value */
+			lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
+		else
+			lfsr_value = calculate_lfsr(spu_cycle_reset);
+
+		/* must use a non zero value. Zero disables data collection. */
+		if (lfsr_value == 0)
+			lfsr_value = calculate_lfsr(1);
+
+		lfsr_value = lfsr_value << 8; /* shift lfsr to correct
+						* register location
+						*/
+
+		/* debug bus setup */
+		ret = pm_rtas_activate_spu_profiling(cbe_cpu_to_node(cpu));
+
+		if (unlikely(ret)) {
+			rtas_error = ret;
+			goto out;
+		}
+
+
+		subfunc = 2;	/* 2 - activate SPU tracing, 3 - deactivate */
+
+		/* start profiling */
+		ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
+		  cbe_cpu_to_node(cpu), lfsr_value);
+
+		if (unlikely(ret != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, ret);
+			rtas_error = -EIO;
+			goto out;
+		}
+	}
+
+	rtas_error = start_spu_profiling(spu_cycle_reset);
+	if (rtas_error)
+		goto out_stop;
+
+	oprofile_running = 1;
+	return 0;
+
+out_stop:
+	cell_global_stop_spu();		/* clean up the PMU/debug bus */
 out:
-	;
+	return rtas_error;
 }
 
-static void cell_global_start(struct op_counter_config *ctr)
+static int cell_global_start_ppu(struct op_counter_config *ctr)
 {
-	u32 cpu;
+	u32 cpu, i;
 	u32 interrupt_mask = 0;
-	u32 i;
 
 	/* This routine gets called once for the system.
 	 * There is one performance monitor per node, so we
@@ -651,19 +1002,79 @@ static void cell_global_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 	smp_wmb();
 
-	/* NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
-	 * executed which manipulates the PMU.  We start the "virtual counter"
+	/*
+	 * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
+	 * executed which manipulates the PMU.	We start the "virtual counter"
 	 * here so that we do not need to synchronize access to the PMU in
 	 * the above for-loop.
 	 */
 	start_virt_cntrs();
+
+	return 0;
 }
 
-static void cell_global_stop(void)
+static int cell_global_start(struct op_counter_config *ctr)
+{
+	if (spu_cycle_reset)
+		return cell_global_start_spu(ctr);
+	else
+		return cell_global_start_ppu(ctr);
+}
+
+/*
+ * Note the generic OProfile stop calls do not support returning
+ * an error on stop.  Hence, will not return an error if the FW
+ * calls fail on stop.	Failure to reset the debug bus is not an issue.
+ * Failure to disable the SPU profiling is not an issue.  The FW calls
+ * to enable the performance counters and debug bus will work even if
+ * the hardware was not cleanly reset.
+ */
+static void cell_global_stop_spu(void)
+{
+	int subfunc, rtn_value;
+	unsigned int lfsr_value;
+	int cpu;
+
+	oprofile_running = 0;
+
+#ifdef CONFIG_CPU_FREQ
+	cpufreq_unregister_notifier(&cpu_freq_notifier_block,
+				    CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	for_each_online_cpu(cpu) {
+		if (cbe_get_hw_thread_id(cpu))
+			continue;
+
+		subfunc = 3;	/*
+				 * 2 - activate SPU tracing,
+				 * 3 - deactivate
+				 */
+		lfsr_value = 0x8f100000;
+
+		rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL,
+				      subfunc, cbe_cpu_to_node(cpu),
+				      lfsr_value);
+
+		if (unlikely(rtn_value != 0)) {
+			printk(KERN_ERR
+			       "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n",
+			       __FUNCTION__, rtn_value);
+		}
+
+		/* Deactivate the signals */
+		pm_rtas_reset_signals(cbe_cpu_to_node(cpu));
+	}
+
+	stop_spu_profiling();
+}
+
+static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
-	/* This routine will be called once for the system.
+	/*
+	 * This routine will be called once for the system.
 	 * There is one performance monitor per node, so we
 	 * only need to perform this function once per node.
 	 */
@@ -687,8 +1098,16 @@ static void cell_global_stop(void)
 	}
 }
 
-static void
-cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
+static void cell_global_stop(void)
+{
+	if (spu_cycle_reset)
+		cell_global_stop_spu();
+	else
+		cell_global_stop_ppu();
+}
+
+static void cell_handle_interrupt(struct pt_regs *regs,
+				struct op_counter_config *ctr)
 {
 	u32 cpu;
 	u64 pc;
@@ -699,13 +1118,15 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 
 	cpu = smp_processor_id();
 
-	/* Need to make sure the interrupt handler and the virt counter
+	/*
+	 * Need to make sure the interrupt handler and the virt counter
 	 * routine are not running at the same time. See the
 	 * cell_virtual_cntr() routine for additional comments.
 	 */
 	spin_lock_irqsave(&virt_cntr_lock, flags);
 
-	/* Need to disable and reenable the performance counters
+	/*
+	 * Need to disable and reenable the performance counters
 	 * to get the desired behavior from the hardware.  This
 	 * is hardware specific.
 	 */
@@ -714,7 +1135,8 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 
 	interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
 
-	/* If the interrupt mask has been cleared, then the virt cntr
+	/*
+	 * If the interrupt mask has been cleared, then the virt cntr
 	 * has cleared the interrupt.  When the thread that generated
 	 * the interrupt is restored, the data count will be restored to
 	 * 0xffffff0 to cause the interrupt to be regenerated.
@@ -732,18 +1154,20 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 			}
 		}
 
-		/* The counters were frozen by the interrupt.
+		/*
+		 * The counters were frozen by the interrupt.
 		 * Reenable the interrupt and restart the counters.
 		 * If there was a race between the interrupt handler and
-		 * the virtual counter routine.  The virutal counter
+		 * the virtual counter routine.	 The virutal counter
 		 * routine may have cleared the interrupts.  Hence must
 		 * use the virt_cntr_inter_mask to re-enable the interrupts.
 		 */
 		cbe_enable_pm_interrupts(cpu, hdw_thread,
 					 virt_cntr_inter_mask);
 
-		/* The writes to the various performance counters only writes
-		 * to a latch.  The new values (interrupt setting bits, reset
+		/*
+		 * The writes to the various performance counters only writes
+		 * to a latch.	The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
 		 * this to work as desired, the permormance monitor needs to
@@ -755,10 +1179,33 @@ cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
+/*
+ * This function is called from the generic OProfile
+ * driver.  When profiling PPUs, we need to do the
+ * generic sync start; otherwise, do spu_sync_start.
+ */
+static int cell_sync_start(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_start();
+	else
+		return DO_GENERIC_SYNC;
+}
+
+static int cell_sync_stop(void)
+{
+	if (spu_cycle_reset)
+		return spu_sync_stop();
+	else
+		return 1;
+}
+
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,
 	.global_start = cell_global_start,
 	.global_stop = cell_global_stop,
+	.sync_start = cell_sync_start,
+	.sync_stop = cell_sync_stop,
 	.handle_interrupt = cell_handle_interrupt,
 };
diff --git a/arch/powerpc/oprofile/op_model_fsl_booke.c b/arch/powerpc/oprofile/op_model_fsl_booke.c
index 2267eb8c661b..183a28bb1812 100644
--- a/arch/powerpc/oprofile/op_model_fsl_booke.c
+++ b/arch/powerpc/oprofile/op_model_fsl_booke.c
@@ -244,7 +244,7 @@ static void dump_pmcs(void)
 			mfpmr(PMRN_PMLCA3), mfpmr(PMRN_PMLCB3));
 }
 
-static void fsl_booke_cpu_setup(struct op_counter_config *ctr)
+static int fsl_booke_cpu_setup(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -258,9 +258,11 @@ static void fsl_booke_cpu_setup(struct op_counter_config *ctr)
 
 		set_pmc_user_kernel(i, ctr[i].user, ctr[i].kernel);
 	}
+
+	return 0;
 }
 
-static void fsl_booke_reg_setup(struct op_counter_config *ctr,
+static int fsl_booke_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -276,9 +278,10 @@ static void fsl_booke_reg_setup(struct op_counter_config *ctr,
 	for (i = 0; i < num_counters; ++i)
 		reset_value[i] = 0x80000000UL - ctr[i].count;
 
+	return 0;
 }
 
-static void fsl_booke_start(struct op_counter_config *ctr)
+static int fsl_booke_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -308,6 +311,8 @@ static void fsl_booke_start(struct op_counter_config *ctr)
 
 	pr_debug("start on cpu %d, pmgc0 %x\n", smp_processor_id(),
 			mfpmr(PMRN_PMGC0));
+
+	return 0;
 }
 
 static void fsl_booke_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_pa6t.c b/arch/powerpc/oprofile/op_model_pa6t.c
index e8a56b0adadc..c40de461fd4e 100644
--- a/arch/powerpc/oprofile/op_model_pa6t.c
+++ b/arch/powerpc/oprofile/op_model_pa6t.c
@@ -89,7 +89,7 @@ static inline void ctr_write(unsigned int i, u64 val)
 
 
 /* precompute the values to stuff in the hardware registers */
-static void pa6t_reg_setup(struct op_counter_config *ctr,
+static int pa6t_reg_setup(struct op_counter_config *ctr,
 			   struct op_system_config *sys,
 			   int num_ctrs)
 {
@@ -135,10 +135,12 @@ static void pa6t_reg_setup(struct op_counter_config *ctr,
 		pr_debug("reset_value for pmc%u inited to 0x%lx\n",
 				 pmc, reset_value[pmc]);
 	}
+
+	return 0;
 }
 
 /* configure registers on this cpu */
-static void pa6t_cpu_setup(struct op_counter_config *ctr)
+static int pa6t_cpu_setup(struct op_counter_config *ctr)
 {
 	u64 mmcr0 = mmcr0_val;
 	u64 mmcr1 = mmcr1_val;
@@ -154,9 +156,11 @@ static void pa6t_cpu_setup(struct op_counter_config *ctr)
 		mfspr(SPRN_PA6T_MMCR0));
 	pr_debug("setup on cpu %d, mmcr1 %016lx\n", smp_processor_id(),
 		mfspr(SPRN_PA6T_MMCR1));
+
+	return 0;
 }
 
-static void pa6t_start(struct op_counter_config *ctr)
+static int pa6t_start(struct op_counter_config *ctr)
 {
 	int i;
 
@@ -174,6 +178,8 @@ static void pa6t_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 
 	pr_debug("start on cpu %d, mmcr0 %lx\n", smp_processor_id(), mmcr0);
+
+	return 0;
 }
 
 static void pa6t_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c
index a7c206b665af..cddc250a6a5c 100644
--- a/arch/powerpc/oprofile/op_model_power4.c
+++ b/arch/powerpc/oprofile/op_model_power4.c
@@ -32,7 +32,7 @@ static u32 mmcr0_val;
 static u64 mmcr1_val;
 static u64 mmcra_val;
 
-static void power4_reg_setup(struct op_counter_config *ctr,
+static int power4_reg_setup(struct op_counter_config *ctr,
 			     struct op_system_config *sys,
 			     int num_ctrs)
 {
@@ -60,6 +60,8 @@ static void power4_reg_setup(struct op_counter_config *ctr,
 		mmcr0_val &= ~MMCR0_PROBLEM_DISABLE;
 	else
 		mmcr0_val |= MMCR0_PROBLEM_DISABLE;
+
+	return 0;
 }
 
 extern void ppc64_enable_pmcs(void);
@@ -84,7 +86,7 @@ static inline int mmcra_must_set_sample(void)
 	return 0;
 }
 
-static void power4_cpu_setup(struct op_counter_config *ctr)
+static int power4_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0 = mmcr0_val;
 	unsigned long mmcra = mmcra_val;
@@ -111,9 +113,11 @@ static void power4_cpu_setup(struct op_counter_config *ctr)
 	    mfspr(SPRN_MMCR1));
 	dbg("setup on cpu %d, mmcra %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCRA));
+
+	return 0;
 }
 
-static void power4_start(struct op_counter_config *ctr)
+static int power4_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -148,6 +152,7 @@ static void power4_start(struct op_counter_config *ctr)
 	oprofile_running = 1;
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void power4_stop(void)
diff --git a/arch/powerpc/oprofile/op_model_rs64.c b/arch/powerpc/oprofile/op_model_rs64.c
index c731acbfb2a5..a20afe45d936 100644
--- a/arch/powerpc/oprofile/op_model_rs64.c
+++ b/arch/powerpc/oprofile/op_model_rs64.c
@@ -88,7 +88,7 @@ static unsigned long reset_value[OP_MAX_COUNTER];
 
 static int num_counters;
 
-static void rs64_reg_setup(struct op_counter_config *ctr,
+static int rs64_reg_setup(struct op_counter_config *ctr,
 			   struct op_system_config *sys,
 			   int num_ctrs)
 {
@@ -100,9 +100,10 @@ static void rs64_reg_setup(struct op_counter_config *ctr,
 		reset_value[i] = 0x80000000UL - ctr[i].count;
 
 	/* XXX setup user and kernel profiling */
+	return 0;
 }
 
-static void rs64_cpu_setup(struct op_counter_config *ctr)
+static int rs64_cpu_setup(struct op_counter_config *ctr)
 {
 	unsigned int mmcr0;
 
@@ -125,9 +126,11 @@ static void rs64_cpu_setup(struct op_counter_config *ctr)
 	    mfspr(SPRN_MMCR0));
 	dbg("setup on cpu %d, mmcr1 %lx\n", smp_processor_id(),
 	    mfspr(SPRN_MMCR1));
+
+	return 0;
 }
 
-static void rs64_start(struct op_counter_config *ctr)
+static int rs64_start(struct op_counter_config *ctr)
 {
 	int i;
 	unsigned int mmcr0;
@@ -155,6 +158,7 @@ static void rs64_start(struct op_counter_config *ctr)
 	mtspr(SPRN_MMCR0, mmcr0);
 
 	dbg("start on cpu %d, mmcr0 %x\n", smp_processor_id(), mmcr0);
+	return 0;
 }
 
 static void rs64_stop(void)
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index a7efb999d65e..6694f86d7000 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <asm/atomic.h>
 #include <asm/spu.h>
@@ -81,6 +82,8 @@ void destroy_spu_context(struct kref *kref)
 	spu_fini_csa(&ctx->csa);
 	if (ctx->gang)
 		spu_gang_remove_ctx(ctx->gang, ctx);
+	if (ctx->prof_priv_kref)
+		kref_put(ctx->prof_priv_kref, ctx->prof_priv_release);
 	BUG_ON(!list_empty(&ctx->rq));
 	atomic_dec(&nr_spu_contexts);
 	kfree(ctx);
@@ -185,3 +188,20 @@ void spu_release_saved(struct spu_context *ctx)
 
 	spu_release(ctx);
 }
+
+void spu_set_profile_private_kref(struct spu_context *ctx,
+				  struct kref *prof_info_kref,
+				  void ( * prof_info_release) (struct kref *kref))
+{
+	ctx->prof_priv_kref = prof_info_kref;
+	ctx->prof_priv_release = prof_info_release;
+}
+EXPORT_SYMBOL_GPL(spu_set_profile_private_kref);
+
+void *spu_get_profile_private_kref(struct spu_context *ctx)
+{
+	return ctx->prof_priv_kref;
+}
+EXPORT_SYMBOL_GPL(spu_get_profile_private_kref);
+
+
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 88ec333e90d3..44e2338a05d5 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -274,6 +274,7 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	ctx->spu = spu;
 	ctx->ops = &spu_hw_ops;
 	spu->pid = current->pid;
+	spu->tgid = current->tgid;
 	spu_associate_mm(spu, ctx->owner);
 	spu->ibox_callback = spufs_ibox_callback;
 	spu->wbox_callback = spufs_wbox_callback;
@@ -456,6 +457,7 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 	spu->dma_callback = NULL;
 	spu_associate_mm(spu, NULL);
 	spu->pid = 0;
+	spu->tgid = 0;
 	ctx->ops = &spu_backing_ops;
 	spu->flags = 0;
 	spu->ctx = NULL;
@@ -737,7 +739,7 @@ void spu_deactivate(struct spu_context *ctx)
 }
 
 /**
- * spu_yield -  yield a physical spu if others are waiting
+ * spu_yield -	yield a physical spu if others are waiting
  * @ctx:	spu context to yield
  *
  * Check if there is a higher priority context waiting and if yes
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index 692dbd0edc37..8b20c0c1556f 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -85,6 +85,8 @@ struct spu_context {
 
 	struct list_head gang_list;
 	struct spu_gang *gang;
+	struct kref *prof_priv_kref;
+	void ( * prof_priv_release) (struct kref *kref);
 
 	/* owner thread */
 	pid_t tid;
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index edd6de995726..8134c7e198a5 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -26,8 +26,9 @@
 #include <linux/profile.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/oprofile.h>
 #include <linux/sched.h>
- 
+
 #include "oprofile_stats.h"
 #include "event_buffer.h"
 #include "cpu_buffer.h"
diff --git a/drivers/oprofile/event_buffer.h b/drivers/oprofile/event_buffer.h
index 9b6a4ebd03e3..5076ed1ebd8f 100644
--- a/drivers/oprofile/event_buffer.h
+++ b/drivers/oprofile/event_buffer.h
@@ -19,28 +19,10 @@ void free_event_buffer(void);
  
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
- 
-/* Each escaped entry is prefixed by ESCAPE_CODE
- * then one of the following codes, then the
- * relevant data.
- */
-#define ESCAPE_CODE			~0UL
-#define CTX_SWITCH_CODE 		1
-#define CPU_SWITCH_CODE 		2
-#define COOKIE_SWITCH_CODE 		3
-#define KERNEL_ENTER_SWITCH_CODE	4
-#define KERNEL_EXIT_SWITCH_CODE		5
-#define MODULE_LOADED_CODE		6
-#define CTX_TGID_CODE			7
-#define TRACE_BEGIN_CODE		8
-#define TRACE_END_CODE			9
- 
+
 #define INVALID_COOKIE ~0UL
 #define NO_COOKIE 0UL
 
-/* add data to the event buffer */
-void add_event_entry(unsigned long data);
- 
 extern const struct file_operations event_buffer_fops;
  
 /* mutex between sync_cpu_buffers() and the
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index e5162a64018b..2c645170f06e 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -53,9 +53,24 @@ int oprofile_setup(void)
 	 * us missing task deaths and eventually oopsing
 	 * when trying to process the event buffer.
 	 */
+	if (oprofile_ops.sync_start) {
+		int sync_ret = oprofile_ops.sync_start();
+		switch (sync_ret) {
+		case 0:
+			goto post_sync;
+		case 1:
+			goto do_generic;
+		case -1:
+			goto out3;
+		default:
+			goto out3;
+		}
+	}
+do_generic:
 	if ((err = sync_start()))
 		goto out3;
 
+post_sync:
 	is_setup = 1;
 	mutex_unlock(&start_mutex);
 	return 0;
@@ -118,7 +133,20 @@ out:
 void oprofile_shutdown(void)
 {
 	mutex_lock(&start_mutex);
+	if (oprofile_ops.sync_stop) {
+		int sync_ret = oprofile_ops.sync_stop();
+		switch (sync_ret) {
+		case 0:
+			goto post_sync;
+		case 1:
+			goto do_generic;
+		default:
+			goto post_sync;
+		}
+	}
+do_generic:
 	sync_stop();
+post_sync:
 	if (oprofile_ops.shutdown)
 		oprofile_ops.shutdown();
 	is_setup = 0;
diff --git a/include/asm-powerpc/oprofile_impl.h b/include/asm-powerpc/oprofile_impl.h
index 8d6b47f7b300..938fefb4c4bc 100644
--- a/include/asm-powerpc/oprofile_impl.h
+++ b/include/asm-powerpc/oprofile_impl.h
@@ -39,14 +39,16 @@ struct op_system_config {
 
 /* Per-arch configuration */
 struct op_powerpc_model {
-	void (*reg_setup) (struct op_counter_config *,
+	int (*reg_setup) (struct op_counter_config *,
 			   struct op_system_config *,
 			   int num_counters);
-	void (*cpu_setup) (struct op_counter_config *);
-	void (*start) (struct op_counter_config *);
-        void (*global_start) (struct op_counter_config *);
+	int  (*cpu_setup) (struct op_counter_config *);
+	int  (*start) (struct op_counter_config *);
+	int  (*global_start) (struct op_counter_config *);
 	void (*stop) (void);
 	void (*global_stop) (void);
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
 	void (*handle_interrupt) (struct pt_regs *,
 				  struct op_counter_config *);
 	int num_counters;
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index 24f352da2869..a0f7fc8e23bb 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -138,6 +138,7 @@ struct spu {
 	struct spu_runqueue *rq;
 	unsigned long long timestamp;
 	pid_t pid;
+	pid_t tgid;
 	int class_0_pending;
 	spinlock_t register_lock;
 
@@ -217,6 +218,20 @@ extern void spu_associate_mm(struct spu *spu, struct mm_struct *mm);
 struct mm_struct;
 extern void spu_flush_all_slbs(struct mm_struct *mm);
 
+/* This interface allows a profiler (e.g., OProfile) to store a ref
+ * to spu context information that it creates.	This caching technique
+ * avoids the need to recreate this information after a save/restore operation.
+ *
+ * Assumes the caller has already incremented the ref count to
+ * profile_info; then spu_context_destroy must call kref_put
+ * on prof_info_kref.
+ */
+void spu_set_profile_private_kref(struct spu_context *ctx,
+				  struct kref *prof_info_kref,
+				  void ( * prof_info_release) (struct kref *kref));
+
+void *spu_get_profile_private_kref(struct spu_context *ctx);
+
 /* system callbacks from the SPU */
 struct spu_syscall_block {
 	u64 nr_ret;
diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h
index 0fe7cdf326f7..98c69ab80c84 100644
--- a/include/linux/dcookies.h
+++ b/include/linux/dcookies.h
@@ -12,6 +12,7 @@
 
 #ifdef CONFIG_PROFILING
  
+#include <linux/dcache.h>
 #include <linux/types.h>
  
 struct dcookie_user;
diff --git a/include/linux/elf-em.h b/include/linux/elf-em.h
index 0311bad838b1..5834e843a946 100644
--- a/include/linux/elf-em.h
+++ b/include/linux/elf-em.h
@@ -20,7 +20,8 @@
 #define EM_PARISC	15	/* HPPA */
 #define EM_SPARC32PLUS	18	/* Sun's "v8plus" */
 #define EM_PPC		20	/* PowerPC */
-#define EM_PPC64	21       /* PowerPC64 */
+#define EM_PPC64	21	 /* PowerPC64 */
+#define EM_SPU		23	/* Cell BE SPU */
 #define EM_SH		42	/* SuperH */
 #define EM_SPARCV9	43	/* SPARC v9 64-bit */
 #define EM_IA_64	50	/* HP/Intel IA-64 */
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 0d514b252454..041bb31100f4 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -17,6 +17,26 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
  
+/* Each escaped entry is prefixed by ESCAPE_CODE
+ * then one of the following codes, then the
+ * relevant data.
+ * These #defines live in this file so that arch-specific
+ * buffer sync'ing code can access them.
+ */
+#define ESCAPE_CODE			~0UL
+#define CTX_SWITCH_CODE			1
+#define CPU_SWITCH_CODE			2
+#define COOKIE_SWITCH_CODE		3
+#define KERNEL_ENTER_SWITCH_CODE	4
+#define KERNEL_EXIT_SWITCH_CODE		5
+#define MODULE_LOADED_CODE		6
+#define CTX_TGID_CODE			7
+#define TRACE_BEGIN_CODE		8
+#define TRACE_END_CODE			9
+#define XEN_ENTER_SWITCH_CODE		10
+#define SPU_PROFILING_CODE		11
+#define SPU_CTX_SWITCH_CODE		12
+
 struct super_block;
 struct dentry;
 struct file_operations;
@@ -35,6 +55,14 @@ struct oprofile_operations {
 	int (*start)(void);
 	/* Stop delivering interrupts. */
 	void (*stop)(void);
+	/* Arch-specific buffer sync functions.
+	 * Return value = 0:  Success
+	 * Return value = -1: Failure
+	 * Return value = 1:  Run generic sync function
+	 */
+	int (*sync_start)(void);
+	int (*sync_stop)(void);
+
 	/* Initiate a stack backtrace. Optional. */
 	void (*backtrace)(struct pt_regs * const regs, unsigned int depth);
 	/* CPU identification string. */
@@ -55,6 +83,13 @@ int oprofile_arch_init(struct oprofile_operations * ops);
  */
 void oprofile_arch_exit(void);
 
+/**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
 /**
  * Add a sample. This may be called from any context. Pass
  * smp_processor_id() as cpu.
-- 
cgit v1.3-8-gc7d7


From 3d58ffe2aa107df6db57f875dba5368960b17cde Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Wed, 18 Jul 2007 18:41:08 -0300
Subject: V4L/DVB (5867): videodev2.h: add missing <sys/time.h> for userspace

When videodev2.h is included by an application, it needs to include
<sys/time.h> for the timeval struct.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/videodev2.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index d16a2b57dc81..c66c8a3410b9 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -60,6 +60,7 @@
 #include <linux/compiler.h> /* need __user */
 #else
 #define __user
+#include <sys/time.h>
 #endif
 #include <linux/types.h>
 
-- 
cgit v1.3-8-gc7d7


From 31ce72a6b1c7635259cf522459539c0611f2c50c Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 20 Jul 2007 19:45:45 -0700
Subject: [NET]: Fix loopback crashes when multiqueue is enabled.

From: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 net/core/dev.c            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9820ca1e45e2..4a616d73cc25 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -575,7 +575,7 @@ struct net_device
 
 	/* The TX queue control structures */
 	unsigned int			egress_subqueue_count;
-	struct net_device_subqueue	egress_subqueue[0];
+	struct net_device_subqueue	egress_subqueue[1];
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 38212c3f9971..ee4035571c21 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3624,7 +3624,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	/* ensure 32-byte alignment of both the device and private area */
 	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
-		     (sizeof(struct net_device_subqueue) * queue_count)) &
+		     (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
 		     ~NETDEV_ALIGN_CONST;
 	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
 
@@ -3642,7 +3642,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		dev->priv = ((char *)dev +
 			     ((sizeof(struct net_device) +
 			       (sizeof(struct net_device_subqueue) *
-				queue_count) + NETDEV_ALIGN_CONST)
+				(queue_count - 1)) + NETDEV_ALIGN_CONST)
 			      & ~NETDEV_ALIGN_CONST));
 	}
 
-- 
cgit v1.3-8-gc7d7


From 39dca558a5b52b63e49bc234a7e887be092aa690 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Fri, 20 Jul 2007 18:22:17 -0500
Subject: [SCSI] bsg: make class backlinks

Currently, bsg doesn't make class backlinks (a process whereby you'd get
a link to bsg in the device directory in the same way you get one for
sg).  This is because the bsg device is uninitialised, so the class
device has nothing it can attach to.  The fix is to make the bsg device
point to the cdevice of the entity creating the bsg, necessitating
changing the bsg_register_queue() prototype into a form that takes the
generic device.

Acked-by: FUJITA Tomonori <tomof@acm.org>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 block/bsg.c                       | 21 +++++++++++++++++----
 drivers/scsi/scsi_sysfs.c         |  2 +-
 drivers/scsi/scsi_transport_sas.c | 32 ++++++++++++++++++--------------
 include/linux/bsg.h               |  4 ++--
 4 files changed, 38 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg.c b/block/bsg.c
index 0e3d5d490d20..4eebcd5c7311 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -936,20 +936,29 @@ void bsg_unregister_queue(struct request_queue *q)
 
 	mutex_lock(&bsg_mutex);
 	sysfs_remove_link(&q->kobj, "bsg");
-	class_device_destroy(bsg_class, MKDEV(bsg_major, bcd->minor));
+	class_device_unregister(bcd->class_dev);
+	put_device(bcd->dev);
 	bcd->class_dev = NULL;
+	bcd->dev = NULL;
 	list_del_init(&bcd->list);
 	bsg_device_nr--;
 	mutex_unlock(&bsg_mutex);
 }
 EXPORT_SYMBOL_GPL(bsg_unregister_queue);
 
-int bsg_register_queue(struct request_queue *q, const char *name)
+int bsg_register_queue(struct request_queue *q, struct device *gdev,
+		       const char *name)
 {
 	struct bsg_class_device *bcd, *__bcd;
 	dev_t dev;
 	int ret = -EMFILE;
 	struct class_device *class_dev = NULL;
+	const char *devname;
+
+	if (name)
+		devname = name;
+	else
+		devname = gdev->bus_id;
 
 	/*
 	 * we need a proper transport to send commands, not a stacked device
@@ -982,11 +991,13 @@ retry:
 		bsg_minor_idx = 0;
 
 	bcd->queue = q;
+	bcd->dev = get_device(gdev);
 	dev = MKDEV(bsg_major, bcd->minor);
-	class_dev = class_device_create(bsg_class, NULL, dev, bcd->dev, "%s", name);
+	class_dev = class_device_create(bsg_class, NULL, dev, gdev, "%s",
+					devname);
 	if (IS_ERR(class_dev)) {
 		ret = PTR_ERR(class_dev);
-		goto err;
+		goto err_put;
 	}
 	bcd->class_dev = class_dev;
 
@@ -1004,6 +1015,8 @@ retry:
 
 err_unregister:
 	class_device_unregister(class_dev);
+err_put:
+	put_device(gdev);
 err:
 	mutex_unlock(&bsg_mutex);
 	return ret;
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index ad5f21fd5d45..34cdce6738a6 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -736,7 +736,7 @@ int scsi_sysfs_add_sdev(struct scsi_device *sdev)
 	 * released by the sdev_class .release */
 	get_device(&sdev->sdev_gendev);
 
-	error = bsg_register_queue(rq, sdev->sdev_gendev.bus_id);
+	error = bsg_register_queue(rq, &sdev->sdev_gendev, NULL);
 
 	if (error)
 		sdev_printk(KERN_INFO, sdev,
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 573f588154d0..3120f4b3a11a 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -191,25 +191,34 @@ static void sas_non_host_smp_request(struct request_queue *q)
 	sas_smp_request(q, rphy_to_shost(rphy), rphy);
 }
 
-static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy,
-			      char *name)
+static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 {
 	struct request_queue *q;
 	int error;
+	struct device *dev;
+	char namebuf[BUS_ID_SIZE];
+	const char *name;
 
 	if (!to_sas_internal(shost->transportt)->f->smp_handler) {
 		printk("%s can't handle SMP requests\n", shost->hostt->name);
 		return 0;
 	}
 
-	if (rphy)
+	if (rphy) {
 		q = blk_init_queue(sas_non_host_smp_request, NULL);
-	else
+		dev = &rphy->dev;
+		name = dev->bus_id;
+	} else {
 		q = blk_init_queue(sas_host_smp_request, NULL);
+		dev = &shost->shost_gendev;
+		snprintf(namebuf, sizeof(namebuf),
+			 "sas_host%d", shost->host_no);
+		name = namebuf;
+	}
 	if (!q)
 		return -ENOMEM;
 
-	error = bsg_register_queue(q, name);
+	error = bsg_register_queue(q, dev, name);
 	if (error) {
 		blk_cleanup_queue(q);
 		return -ENOMEM;
@@ -255,7 +264,6 @@ static int sas_host_setup(struct transport_container *tc, struct device *dev,
 {
 	struct Scsi_Host *shost = dev_to_shost(dev);
 	struct sas_host_attrs *sas_host = to_sas_host_attrs(shost);
-	char name[BUS_ID_SIZE];
 
 	INIT_LIST_HEAD(&sas_host->rphy_list);
 	mutex_init(&sas_host->lock);
@@ -263,8 +271,7 @@ static int sas_host_setup(struct transport_container *tc, struct device *dev,
 	sas_host->next_expander_id = 0;
 	sas_host->next_port_id = 0;
 
-	snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
-	if (sas_bsg_initialize(shost, NULL, name))
+	if (sas_bsg_initialize(shost, NULL))
 		dev_printk(KERN_ERR, dev, "fail to a bsg device %d\n",
 			   shost->host_no);
 
@@ -1332,9 +1339,6 @@ struct sas_rphy *sas_end_device_alloc(struct sas_port *parent)
 	sas_rphy_initialize(&rdev->rphy);
 	transport_setup_device(&rdev->rphy.dev);
 
-	if (sas_bsg_initialize(shost, &rdev->rphy, rdev->rphy.dev.bus_id))
-		printk("fail to a bsg device %s\n", rdev->rphy.dev.bus_id);
-
 	return &rdev->rphy;
 }
 EXPORT_SYMBOL(sas_end_device_alloc);
@@ -1374,9 +1378,6 @@ struct sas_rphy *sas_expander_alloc(struct sas_port *parent,
 	sas_rphy_initialize(&rdev->rphy);
 	transport_setup_device(&rdev->rphy.dev);
 
-	if (sas_bsg_initialize(shost, &rdev->rphy, rdev->rphy.dev.bus_id))
-		printk("fail to a bsg device %s\n", rdev->rphy.dev.bus_id);
-
 	return &rdev->rphy;
 }
 EXPORT_SYMBOL(sas_expander_alloc);
@@ -1404,6 +1405,9 @@ int sas_rphy_add(struct sas_rphy *rphy)
 		return error;
 	transport_add_device(&rphy->dev);
 	transport_configure_device(&rphy->dev);
+	if (sas_bsg_initialize(shost, rphy))
+		printk("fail to a bsg device %s\n", rphy->dev.bus_id);
+
 
 	mutex_lock(&sas_host->lock);
 	list_add_tail(&rphy->list, &sas_host->rphy_list);
diff --git a/include/linux/bsg.h b/include/linux/bsg.h
index 8547b10c388b..f415f89e0ac8 100644
--- a/include/linux/bsg.h
+++ b/include/linux/bsg.h
@@ -57,10 +57,10 @@ struct bsg_class_device {
 	struct request_queue *queue;
 };
 
-extern int bsg_register_queue(struct request_queue *, const char *);
+extern int bsg_register_queue(struct request_queue *, struct device *, const char *);
 extern void bsg_unregister_queue(struct request_queue *);
 #else
-#define bsg_register_queue(disk, name)		(0)
+#define bsg_register_queue(disk, dev, name)		(0)
 #define bsg_unregister_queue(disk)	do { } while (0)
 #endif
 
-- 
cgit v1.3-8-gc7d7


From d3fec424b23c47686efcf3f2004c3f1c1cee4d9c Mon Sep 17 00:00:00 2001
From: Jan Harkes <jaharkes@cs.cmu.edu>
Date: Sat, 21 Jul 2007 04:37:26 -0700
Subject: coda: remove CODA_STORE/CODA_RELEASE upcalls

This is an variation on the patch sent by Christoph Hellwig which kills
file_count abuse by the Coda kernel module by moving the coda_flush
functionality into coda_release.  However part of reason we were using the
coda_flush callback was to allow Coda to pass errors that occur during
writeback from the userspace cache manager back to close().

As Al Viro explained on linux-fsdevel, it is impossible to guarantee that
such errors can in fact be returned back to the caller.  There are many
cases where the last reference to a file is not released by the close
system call and it is also impossible to pick some close as a 'last-close'
and delay it until all other references have been destroyed.

The CODA_STORE/CODA_RELEASE upcall combination is clearly a broken design,
and it is better to remove it completely.

Signed-off-by: Jan Harkes <jaharkes@cs.cmu.edu>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@ftp.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/dir.c              |  1 -
 fs/coda/file.c             | 65 +++++-----------------------------------------
 fs/coda/upcall.c           | 49 +---------------------------------
 include/linux/coda_linux.h |  1 -
 include/linux/coda_psdev.h |  3 ---
 5 files changed, 7 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 8e61236abf4a..f89ff083079b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -86,7 +86,6 @@ const struct file_operations coda_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= coda_readdir,
 	.open		= coda_open,
-	.flush		= coda_flush,
 	.release	= coda_release,
 	.fsync		= coda_fsync,
 };
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7594962604c2..29137ff3ca67 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -25,10 +25,6 @@
 
 #include "coda_int.h"
 
-/* if CODA_STORE fails with EOPNOTSUPP, venus clearly doesn't support
- * CODA_STORE/CODA_RELEASE and we fall back on using the CODA_CLOSE upcall */
-static int use_coda_close;
-
 static ssize_t
 coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *ppos)
 {
@@ -163,47 +159,6 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_flush(struct file *coda_file, fl_owner_t id)
-{
-	unsigned short flags = coda_file->f_flags & ~O_EXCL;
-	unsigned short coda_flags = coda_flags_to_cflags(flags);
-	struct coda_file_info *cfi;
-	struct inode *coda_inode;
-	int err = 0, fcnt;
-
-	lock_kernel();
-
-	/* last close semantics */
-	fcnt = file_count(coda_file);
-	if (fcnt > 1)
-		goto out;
-
-	/* No need to make an upcall when we have not made any modifications
-	 * to the file */
-	if ((coda_file->f_flags & O_ACCMODE) == O_RDONLY)
-		goto out;
-
-	if (use_coda_close)
-		goto out;
-
-	cfi = CODA_FTOC(coda_file);
-	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
-
-	coda_inode = coda_file->f_path.dentry->d_inode;
-
-	err = venus_store(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags,
-			  coda_file->f_uid);
-
-	if (err == -EOPNOTSUPP) {
-		use_coda_close = 1;
-		err = 0;
-	}
-
-out:
-	unlock_kernel();
-	return err;
-}
-
 int coda_release(struct inode *coda_inode, struct file *coda_file)
 {
 	unsigned short flags = (coda_file->f_flags) & (~O_EXCL);
@@ -215,21 +170,11 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 
 	lock_kernel();
 
-	if (!use_coda_close) {
-		err = venus_release(coda_inode->i_sb, coda_i2f(coda_inode),
-				    coda_flags);
-		if (err == -EOPNOTSUPP) {
-			use_coda_close = 1;
-			err = 0;
-		}
-	}
-
 	cfi = CODA_FTOC(coda_file);
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 
-	if (use_coda_close)
-		err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
-				  coda_flags, coda_file->f_uid);
+	err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
+			  coda_flags, coda_file->f_uid);
 
 	host_inode = cfi->cfi_container->f_path.dentry->d_inode;
 	cii = ITOC(coda_inode);
@@ -246,7 +191,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	coda_file->private_data = NULL;
 
 	unlock_kernel();
-	return err;
+
+	/* VFS fput ignores the return value from file_operations->release, so
+	 * there is no use returning an error here */
+	return 0;
 }
 
 int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
@@ -288,7 +236,6 @@ const struct file_operations coda_file_operations = {
 	.write		= coda_file_write,
 	.mmap		= coda_file_mmap,
 	.open		= coda_open,
-	.flush		= coda_flush,
 	.release	= coda_release,
 	.fsync		= coda_fsync,
 	.splice_read	= coda_file_splice_read,
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index cd561d2e90b0..cdb4c07a7870 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -160,55 +160,8 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid,
 	return error;
 }
 
-int venus_store(struct super_block *sb, struct CodaFid *fid, int flags,
-                vuid_t uid)
-{
-        union inputArgs *inp;
-        union outputArgs *outp;
-        int insize, outsize, error;
-#ifdef CONFIG_CODA_FS_OLD_API
-	struct coda_cred cred = { 0, };
-	cred.cr_fsuid = uid;
-#endif
-	
-	insize = SIZE(store);
-	UPARG(CODA_STORE);
-	
-#ifdef CONFIG_CODA_FS_OLD_API
-	memcpy(&(inp->ih.cred), &cred, sizeof(cred));
-#else
-	inp->ih.uid = uid;
-#endif
-	
-        inp->coda_store.VFid = *fid;
-        inp->coda_store.flags = flags;
-
-	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
-
-	CODA_FREE(inp, insize);
-        return error;
-}
-
-int venus_release(struct super_block *sb, struct CodaFid *fid, int flags)
-{
-        union inputArgs *inp;
-        union outputArgs *outp;
-        int insize, outsize, error;
-	
-	insize = SIZE(release);
-	UPARG(CODA_RELEASE);
-	
-	inp->coda_release.VFid = *fid;
-	inp->coda_release.flags = flags;
-
-	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
-
-	CODA_FREE(inp, insize);
-	return error;
-}
-
 int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
-                vuid_t uid)
+		vuid_t uid)
 {
 	union inputArgs *inp;
 	union outputArgs *outp;
diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h
index c4079b403e9e..1c47a34aa794 100644
--- a/include/linux/coda_linux.h
+++ b/include/linux/coda_linux.h
@@ -36,7 +36,6 @@ extern const struct file_operations coda_ioctl_operations;
 
 /* operations shared over more than one file */
 int coda_open(struct inode *i, struct file *f);
-int coda_flush(struct file *f, fl_owner_t id);
 int coda_release(struct inode *i, struct file *f);
 int coda_permission(struct inode *inode, int mask, struct nameidata *nd);
 int coda_revalidate_inode(struct dentry *);
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index aa8f454b3b77..07ae8f846055 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -33,9 +33,6 @@ int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *);
 int venus_lookup(struct super_block *sb, struct CodaFid *fid, 
 		 const char *name, int length, int *type, 
 		 struct CodaFid *resfid);
-int venus_store(struct super_block *sb, struct CodaFid *fid, int flags,
-		vuid_t uid);
-int venus_release(struct super_block *sb, struct CodaFid *fid, int flags);
 int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
 		vuid_t uid);
 int venus_open(struct super_block *sb, struct CodaFid *fid, int flags,
-- 
cgit v1.3-8-gc7d7


From 93da56efcf8c6a111f0349f6b7651172d4745ca0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 21 Jul 2007 04:37:33 -0700
Subject: clockevents: remove prototypes of removed functions

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/clockchips.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 8486e78f7335..8d7a39019ace 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -119,10 +119,6 @@ extern void clockevents_register_device(struct clock_event_device *dev);
 
 extern void clockevents_exchange_device(struct clock_event_device *old,
 					struct clock_event_device *new);
-extern
-struct clock_event_device *clockevents_request_device(unsigned int features,
-						      cpumask_t cpumask);
-extern void clockevents_release_device(struct clock_event_device *dev);
 extern void clockevents_set_mode(struct clock_event_device *dev,
 				 enum clock_event_mode mode);
 extern int clockevents_register_notifier(struct notifier_block *nb);
-- 
cgit v1.3-8-gc7d7


From 18de5bc4c1f1f1fa5e14f354a7603bd6e9d4e3b6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 21 Jul 2007 04:37:34 -0700
Subject: clockevents: fix resume logic

We need to make sure, that the clockevent devices are resumed, before
the tick is resumed. The current resume logic does not guarantee this.

Add CLOCK_EVT_MODE_RESUME and call the set mode functions of the clock
event devices before resuming the tick / oneshot functionality.

Fixup the existing users.

Thanks to Nigel Cunningham for tracking down a long standing thinko,
which affected the jinxed VAIO.

[akpm@linux-foundation.org: xen build fix]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-davinci/time.c      |  2 ++
 arch/arm/mach-imx/time.c          |  1 +
 arch/arm/mach-ixp4xx/common.c     |  2 ++
 arch/arm/mach-omap1/time.c        |  1 +
 arch/arm/plat-omap/timer32k.c     |  2 ++
 arch/i386/kernel/apic.c           |  3 ++
 arch/i386/kernel/hpet.c           | 71 +++------------------------------------
 arch/i386/kernel/i8253.c          | 26 +++++++-------
 arch/i386/kernel/vmiclock.c       |  1 +
 arch/i386/xen/time.c              |  3 ++
 arch/sh/kernel/timers/timer-tmu.c |  1 +
 arch/sparc64/kernel/time.c        |  1 +
 drivers/lguest/lguest.c           |  2 ++
 include/linux/clockchips.h        |  1 +
 kernel/time/tick-broadcast.c      |  6 ++--
 kernel/time/tick-common.c         | 16 +++++----
 16 files changed, 51 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c
index 4d8425de6922..e96a3dcdc1a7 100644
--- a/arch/arm/mach-davinci/time.c
+++ b/arch/arm/mach-davinci/time.c
@@ -285,6 +285,8 @@ static void davinci_set_mode(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		t->opts = TIMER_OPTS_DISABLED;
 		break;
+	case CLOCK_EVT_MODE_RESUME:
+		break;
 	}
 }
 
diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c
index 010f6fa984a6..d86d124aea22 100644
--- a/arch/arm/mach-imx/time.c
+++ b/arch/arm/mach-imx/time.c
@@ -159,6 +159,7 @@ static void imx_set_mode(enum clock_event_mode mode, struct clock_event_device *
 		break;
 	case CLOCK_EVT_MODE_SHUTDOWN:
 	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_RESUME:
 		/* Left event sources disabled, no more interrupts appears */
 		break;
 	}
diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index 8112f726ffa0..23e7fba6d3e1 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -459,6 +459,8 @@ static void ixp4xx_set_mode(enum clock_event_mode mode,
 	default:
 		osrt = opts = 0;
 		break;
+	case CLOCK_EVT_MODE_RESUME:
+		break;
 	}
 
 	*IXP4XX_OSRT1 = osrt | opts;
diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c
index 3705d20c4e5c..237651ebae5d 100644
--- a/arch/arm/mach-omap1/time.c
+++ b/arch/arm/mach-omap1/time.c
@@ -156,6 +156,7 @@ static void omap_mpu_set_mode(enum clock_event_mode mode,
 		break;
 	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
+	case CLOCK_EVT_MODE_RESUME:
 		break;
 	}
 }
diff --git a/arch/arm/plat-omap/timer32k.c b/arch/arm/plat-omap/timer32k.c
index 2feceec8eccd..b0af014b0e2c 100644
--- a/arch/arm/plat-omap/timer32k.c
+++ b/arch/arm/plat-omap/timer32k.c
@@ -156,6 +156,8 @@ static void omap_32k_timer_set_mode(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		omap_32k_timer_stop();
 		break;
+	case CLOCK_EVT_MODE_RESUME:
+		break;
 	}
 }
 
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 67824f3bb974..610f44b24367 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
 		apic_write_around(APIC_LVTT, v);
 		break;
+	case CLOCK_EVT_MODE_RESUME:
+		/* Nothing to do here */
+		break;
 	}
 
 	local_irq_restore(flags);
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17d73459fc5f..cfbf792a0bf4 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -187,6 +187,10 @@ static void hpet_set_mode(enum clock_event_mode mode,
 		cfg &= ~HPET_TN_ENABLE;
 		hpet_writel(cfg, HPET_T0_CFG);
 		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		hpet_enable_int();
+		break;
 	}
 }
 
@@ -217,6 +221,7 @@ static struct clocksource clocksource_hpet = {
 	.mask		= HPET_MASK,
 	.shift		= HPET_SHIFT,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+	.resume		= hpet_start_counter,
 };
 
 /*
@@ -291,7 +296,6 @@ int __init hpet_enable(void)
 
 	clocksource_register(&clocksource_hpet);
 
-
 	if (id & HPET_ID_LEGSUP) {
 		hpet_enable_int();
 		hpet_reserve_platform_timers(id);
@@ -524,68 +528,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 #endif
-
-
-/*
- * Suspend/resume part
- */
-
-#ifdef CONFIG_PM
-
-static int hpet_suspend(struct sys_device *sys_device, pm_message_t state)
-{
-	unsigned long cfg = hpet_readl(HPET_CFG);
-
-	cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY);
-	hpet_writel(cfg, HPET_CFG);
-
-	return 0;
-}
-
-static int hpet_resume(struct sys_device *sys_device)
-{
-	unsigned int id;
-
-	hpet_start_counter();
-
-	id = hpet_readl(HPET_ID);
-
-	if (id & HPET_ID_LEGSUP)
-		hpet_enable_int();
-
-	return 0;
-}
-
-static struct sysdev_class hpet_class = {
-	set_kset_name("hpet"),
-	.suspend	= hpet_suspend,
-	.resume		= hpet_resume,
-};
-
-static struct sys_device hpet_device = {
-	.id		= 0,
-	.cls		= &hpet_class,
-};
-
-
-static __init int hpet_register_sysfs(void)
-{
-	int err;
-
-	if (!is_hpet_capable())
-		return 0;
-
-	err = sysdev_class_register(&hpet_class);
-
-	if (!err) {
-		err = sysdev_register(&hpet_device);
-		if (err)
-			sysdev_class_unregister(&hpet_class);
-	}
-
-	return err;
-}
-
-device_initcall(hpet_register_sysfs);
-
-#endif
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index f8a3c4054c70..931eabe1e560 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -3,11 +3,11 @@
  *
  */
 #include <linux/clockchips.h>
-#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/jiffies.h>
-#include <linux/sysdev.h>
 #include <linux/module.h>
-#include <linux/init.h>
+#include <linux/spinlock.h>
 
 #include <asm/smp.h>
 #include <asm/delay.h>
@@ -41,26 +41,24 @@ static void init_pit_timer(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_PERIODIC:
 		/* binary, mode 2, LSB/MSB, ch 0 */
 		outb_p(0x34, PIT_MODE);
-		udelay(10);
 		outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
-		udelay(10);
 		outb(LATCH >> 8 , PIT_CH0);	/* MSB */
 		break;
 
-	/*
-	 * Avoid unnecessary state transitions, as it confuses
-	 * Geode / Cyrix based boxen.
-	 */
 	case CLOCK_EVT_MODE_SHUTDOWN:
-		if (evt->mode == CLOCK_EVT_MODE_UNUSED)
-			break;
 	case CLOCK_EVT_MODE_UNUSED:
-		if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN)
-			break;
+		outb_p(0x30, PIT_MODE);
+		outb_p(0, PIT_CH0);	/* LSB */
+		outb_p(0, PIT_CH0);	/* MSB */
+		break;
+
 	case CLOCK_EVT_MODE_ONESHOT:
 		/* One shot setup */
 		outb_p(0x38, PIT_MODE);
-		udelay(10);
+		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		/* Nothing to do here */
 		break;
 	}
 	spin_unlock_irqrestore(&i8253_lock, flags);
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
index f9b845f4e692..d23077cca45b 100644
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -142,6 +142,7 @@ static void vmi_timer_set_mode(enum clock_event_mode mode,
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
 		break;
 	case CLOCK_EVT_MODE_PERIODIC:
 		cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
index 51fdabf1fd4d..dfd6db69ead5 100644
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -412,6 +412,7 @@ static void xen_timerop_set_mode(enum clock_event_mode mode,
 		break;
 
 	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
 		break;
 
 	case CLOCK_EVT_MODE_UNUSED:
@@ -474,6 +475,8 @@ static void xen_vcpuop_set_mode(enum clock_event_mode mode,
 		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 			BUG();
 		break;
+	case CLOCK_EVT_MODE_RESUME:
+		break;
 	}
 }
 
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
index 097ebd49f1bf..7aca37d79766 100644
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ b/arch/sh/kernel/timers/timer-tmu.c
@@ -80,6 +80,7 @@ static void tmu_set_mode(enum clock_event_mode mode,
 		break;
 	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
+	case CLOCK_EVT_MODE_RESUME:
 		break;
 	}
 }
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index e340eb401fb9..87c10a7544da 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -931,6 +931,7 @@ static void sparc64_timer_setup(enum clock_event_mode mode,
 {
 	switch (mode) {
 	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
 		break;
 
 	case CLOCK_EVT_MODE_SHUTDOWN:
diff --git a/drivers/lguest/lguest.c b/drivers/lguest/lguest.c
index 434fea1e82f7..18dade06d4a9 100644
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -398,6 +398,8 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
 		break;
 	case CLOCK_EVT_MODE_PERIODIC:
 		BUG();
+	case CLOCK_EVT_MODE_RESUME:
+		break;
 	}
 }
 
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 8d7a39019ace..e0bd46eb2414 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -23,6 +23,7 @@ enum clock_event_mode {
 	CLOCK_EVT_MODE_SHUTDOWN,
 	CLOCK_EVT_MODE_PERIODIC,
 	CLOCK_EVT_MODE_ONESHOT,
+	CLOCK_EVT_MODE_RESUME,
 };
 
 /* Clock event notification values */
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 8001d37071f5..8339af229cb9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -49,7 +49,7 @@ cpumask_t *tick_get_broadcast_mask(void)
  */
 static void tick_broadcast_start_periodic(struct clock_event_device *bc)
 {
-	if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
+	if (bc)
 		tick_setup_periodic(bc, 1);
 }
 
@@ -299,7 +299,7 @@ void tick_suspend_broadcast(void)
 	spin_lock_irqsave(&tick_broadcast_lock, flags);
 
 	bc = tick_broadcast_device.evtdev;
-	if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+	if (bc)
 		clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
 
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -316,6 +316,8 @@ int tick_resume_broadcast(void)
 	bc = tick_broadcast_device.evtdev;
 
 	if (bc) {
+		clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+
 		switch (tick_broadcast_device.mode) {
 		case TICKDEV_MODE_PERIODIC:
 			if(!cpus_empty(tick_broadcast_mask))
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a96ec9ab3454..77a21abc8716 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -318,12 +318,17 @@ static void tick_resume(void)
 {
 	struct tick_device *td = &__get_cpu_var(tick_cpu_device);
 	unsigned long flags;
+	int broadcast = tick_resume_broadcast();
 
 	spin_lock_irqsave(&tick_device_lock, flags);
-	if (td->mode == TICKDEV_MODE_PERIODIC)
-		tick_setup_periodic(td->evtdev, 0);
-	else
-		tick_resume_oneshot();
+	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
+
+	if (!broadcast) {
+		if (td->mode == TICKDEV_MODE_PERIODIC)
+			tick_setup_periodic(td->evtdev, 0);
+		else
+			tick_resume_oneshot();
+	}
 	spin_unlock_irqrestore(&tick_device_lock, flags);
 }
 
@@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
 		break;
 
 	case CLOCK_EVT_NOTIFY_RESUME:
-		if (!tick_resume_broadcast())
-			tick_resume();
+		tick_resume();
 		break;
 
 	default:
-- 
cgit v1.3-8-gc7d7


From 82644459c592a28a3eab682f9b88d81019ddfe8b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 21 Jul 2007 04:37:37 -0700
Subject: NTP: move the cmos update code into ntp.c

i386 and sparc64 have the identical code to update the cmos clock.  Move it
into kernel/time/ntp.c as there are other architectures coming along with the
same requirements.

[akpm@linux-foundation.org: build fixes]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: john stultz <johnstul@us.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/Kconfig          |  4 ++++
 arch/i386/kernel/time.c    | 50 ++-------------------------------------
 arch/sparc64/Kconfig       |  4 ++++
 arch/sparc64/kernel/time.c | 53 ++---------------------------------------
 include/asm-i386/timer.h   |  1 -
 include/linux/time.h       |  3 +++
 kernel/time/ntp.c          | 59 +++++++++++++++++++++++++++++++++++++++++++---
 7 files changed, 71 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 7a11b905ef49..361aca8b3ec3 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -18,6 +18,10 @@ config GENERIC_TIME
 	bool
 	default y
 
+config GENERIC_CMOS_UPDATE
+	bool
+	default y
+
 config CLOCKSOURCE_WATCHDOG
 	bool
 	default y
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index a665df61f08c..19a6c678d02e 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void)
 	return retval;
 }
 
-static void sync_cmos_clock(unsigned long dummy);
-
-static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
-int no_sync_cmos_clock;
-
-static void sync_cmos_clock(unsigned long dummy)
-{
-	struct timeval now, next;
-	int fail = 1;
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 * This code is run on a timer.  If the clock is set, that timer
-	 * may not expire at the correct time.  Thus, we adjust...
-	 */
-	if (!ntp_synced())
-		/*
-		 * Not synced, exit, do not restart a timer (if one is
-		 * running, let it run out).
-		 */
-		return;
-
-	do_gettimeofday(&now);
-	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
-	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
-		fail = set_rtc_mmss(now.tv_sec);
-
-	next.tv_usec = USEC_AFTER - now.tv_usec;
-	if (next.tv_usec <= 0)
-		next.tv_usec += USEC_PER_SEC;
-
-	if (!fail)
-		next.tv_sec = 659;
-	else
-		next.tv_sec = 0;
-
-	if (next.tv_usec >= USEC_PER_SEC) {
-		next.tv_sec++;
-		next.tv_usec -= USEC_PER_SEC;
-	}
-	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
-}
-
-void notify_arch_cmos_timer(void)
+int update_persistent_clock(struct timespec now)
 {
-	if (!no_sync_cmos_clock)
-		mod_timer(&sync_cmos_timer, jiffies + 1);
+	return set_rtc_mmss(now.tv_sec);
 }
 
 extern void (*late_time_init)(void);
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index f1cc55677ff2..33dabf588bdd 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -23,6 +23,10 @@ config GENERIC_TIME
 	bool
 	default y
 
+config GENERIC_CMOS_UPDATE
+	bool
+	default y
+
 config GENERIC_CLOCKEVENTS
 	bool
 	default y
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index 87c10a7544da..49063ca2efcd 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -403,58 +403,9 @@ static struct sparc64_tick_ops hbtick_operations __read_mostly = {
 
 static unsigned long timer_ticks_per_nsec_quotient __read_mostly;
 
-#define TICK_SIZE (tick_nsec / 1000)
-
-#define USEC_AFTER	500000
-#define USEC_BEFORE	500000
-
-static void sync_cmos_clock(unsigned long dummy);
-
-static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
-
-static void sync_cmos_clock(unsigned long dummy)
-{
-	struct timeval now, next;
-	int fail = 1;
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 * This code is run on a timer.  If the clock is set, that timer
-	 * may not expire at the correct time.  Thus, we adjust...
-	 */
-	if (!ntp_synced())
-		/*
-		 * Not synced, exit, do not restart a timer (if one is
-		 * running, let it run out).
-		 */
-		return;
-
-	do_gettimeofday(&now);
-	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
-	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
-		fail = set_rtc_mmss(now.tv_sec);
-
-	next.tv_usec = USEC_AFTER - now.tv_usec;
-	if (next.tv_usec <= 0)
-		next.tv_usec += USEC_PER_SEC;
-
-	if (!fail)
-		next.tv_sec = 659;
-	else
-		next.tv_sec = 0;
-
-	if (next.tv_usec >= USEC_PER_SEC) {
-		next.tv_sec++;
-		next.tv_usec -= USEC_PER_SEC;
-	}
-	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
-}
-
-void notify_arch_cmos_timer(void)
+int update_persistent_clock(struct timespec now)
 {
-	mod_timer(&sync_cmos_timer, jiffies + 1);
+	return set_rtc_mmss(now.tv_sec);
 }
 
 /* Kick start a stopped clock (procedure from the Sun NVRAM/hostid FAQ). */
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index 51a713e33a9e..b371667cfdeb 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -11,7 +11,6 @@ unsigned long native_calculate_cpu_khz(void);
 
 extern int timer_ack;
 extern int no_timer_check;
-extern int no_sync_cmos_clock;
 extern int recalibrate_cpu_khz(void);
 
 #ifndef CONFIG_PARAVIRT
diff --git a/include/linux/time.h b/include/linux/time.h
index ec3b0ced0afe..e6aea5146e5d 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 
 #ifdef __KERNEL__
+# include <linux/cache.h>
 # include <linux/seqlock.h>
 #endif
 
@@ -94,6 +95,8 @@ extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock __attribute__((weak));
 
 extern unsigned long read_persistent_clock(void);
+extern int update_persistent_clock(struct timespec now);
+extern int no_sync_cmos_clock __read_mostly;
 void timekeeping_init(void);
 
 static inline unsigned long get_seconds(void)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b5e352597cbb..cd91237dbfe3 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,6 +10,7 @@
 
 #include <linux/mm.h>
 #include <linux/time.h>
+#include <linux/timer.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/hrtimer.h>
@@ -175,12 +176,64 @@ u64 current_tick_length(void)
 	return tick_length;
 }
 
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
 
-void __attribute__ ((weak)) notify_arch_cmos_timer(void)
+/* Disable the cmos update - used by virtualization and embedded */
+int no_sync_cmos_clock  __read_mostly;
+
+static void sync_cmos_clock(unsigned long dummy);
+
+static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+
+static void sync_cmos_clock(unsigned long dummy)
+{
+	struct timespec now, next;
+	int fail = 1;
+
+	/*
+	 * If we have an externally synchronized Linux clock, then update
+	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+	 * called as close as possible to 500 ms before the new second starts.
+	 * This code is run on a timer.  If the clock is set, that timer
+	 * may not expire at the correct time.  Thus, we adjust...
+	 */
+	if (!ntp_synced())
+		/*
+		 * Not synced, exit, do not restart a timer (if one is
+		 * running, let it run out).
+		 */
+		return;
+
+	getnstimeofday(&now);
+	if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
+		fail = update_persistent_clock(now);
+
+	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
+	if (next.tv_nsec <= 0)
+		next.tv_nsec += NSEC_PER_SEC;
+
+	if (!fail)
+		next.tv_sec = 659;
+	else
+		next.tv_sec = 0;
+
+	if (next.tv_nsec >= NSEC_PER_SEC) {
+		next.tv_sec++;
+		next.tv_nsec -= NSEC_PER_SEC;
+	}
+	mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
+}
+
+static void notify_cmos_timer(void)
 {
-	return;
+	if (no_sync_cmos_clock)
+		mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
+#else
+static inline void notify_cmos_timer(void) { }
+#endif
+
 /* adjtimex mainly allows reading (and writing, if superuser) of
  * kernel time-keeping variables. used by xntpd.
  */
@@ -345,6 +398,6 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	txc->stbcnt	   = 0;
 	write_sequnlock_irq(&xtime_lock);
 	do_gettimeofday(&txc->time);
-	notify_arch_cmos_timer();
+	notify_cmos_timer();
 	return(result);
 }
-- 
cgit v1.3-8-gc7d7


From ae2c6dcf90c5a9ff9bd9a176cafd43a255fcc64b Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Sat, 21 Jul 2007 17:09:56 +0200
Subject: x86_64: various cleanups in NUMA scan node

In acpi_scan_nodes(), we immediately return -1 if acpi_numa <= 0, meaning
we haven't detected any underlying ACPI topology or we have explicitly
disabled its use from the command-line with numa=noacpi.

acpi_table_print_srat_entry() and acpi_table_parse_srat() are only
referenced within drivers/acpi/numa.c, so we can mark them as static and
remove their prototypes from the header file.

Likewise, pxm_to_node_map[] and node_to_pxm_map[] are only used within
drivers/acpi/numa.c, so we mark them as static and remove their externs
from the header file.

The automatic 'result' variable is unused in acpi_numa_init(), so it's
removed.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86_64/mm/srat.c |  6 +++---
 drivers/acpi/numa.c   | 20 ++++++++++----------
 include/linux/acpi.h  |  2 --
 3 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index a126cb73928b..0e0725db20b7 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -394,6 +394,9 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 {
 	int i;
 
+	if (acpi_numa <= 0)
+		return -1;
+
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cutoff_node(i, start, end);
@@ -403,9 +406,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		}
 	}
 
-	if (acpi_numa <= 0)
-		return -1;
-
 	if (!nodes_cover_memory()) {
 		bad_srat();
 		return -1;
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 0c9f15c54e8c..6c44b522f4d3 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -40,9 +40,9 @@ static nodemask_t nodes_found_map = NODE_MASK_NONE;
 #define NID_INVAL	-1
 
 /* maps to convert between proximity domain and logical node ID */
-static int pxm_to_node_map[MAX_PXM_DOMAINS]
+static int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
 				= { [0 ... MAX_PXM_DOMAINS - 1] = NID_INVAL };
-static int node_to_pxm_map[MAX_NUMNODES]
+static int __cpuinitdata node_to_pxm_map[MAX_NUMNODES]
 				= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
 
 int pxm_to_node(int pxm)
@@ -83,7 +83,8 @@ void __cpuinit acpi_unmap_pxm_to_node(int node)
 	node_clear(node, nodes_found_map);
 }
 
-void __init acpi_table_print_srat_entry(struct acpi_subtable_header * header)
+static void __init
+acpi_table_print_srat_entry(struct acpi_subtable_header *header)
 {
 
 	ACPI_FUNCTION_NAME("acpi_table_print_srat_entry");
@@ -200,7 +201,7 @@ static int __init acpi_parse_srat(struct acpi_table_header *table)
 	return 0;
 }
 
-int __init
+static int __init
 acpi_table_parse_srat(enum acpi_srat_type id,
 		      acpi_table_entry_handler handler, unsigned int max_entries)
 {
@@ -211,14 +212,13 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 
 int __init acpi_numa_init(void)
 {
-	int result;
-
 	/* SRAT: Static Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
-		result = acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-					       acpi_parse_processor_affinity,
-					       NR_CPUS);
-		result = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS);	// IA64 specific
+		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
+				      acpi_parse_processor_affinity, NR_CPUS);
+		acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+				      acpi_parse_memory_affinity,
+				      NR_NODE_MEMBLKS);
 	}
 
 	/* SLIT: System Locality Information Table */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index dc234c508a6f..e88b62e6b3aa 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -88,10 +88,8 @@ int acpi_table_parse (char *id, acpi_table_handler handler);
 int __init acpi_table_parse_entries(char *id, unsigned long table_size,
 	int entry_id, acpi_table_entry_handler handler, unsigned int max_entries);
 int acpi_table_parse_madt (enum acpi_madt_type id, acpi_table_entry_handler handler, unsigned int max_entries);
-int acpi_table_parse_srat (enum acpi_srat_type id, acpi_table_entry_handler handler, unsigned int max_entries);
 int acpi_parse_mcfg (struct acpi_table_header *header);
 void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
-void acpi_table_print_srat_entry (struct acpi_subtable_header *srat);
 
 /* the following four functions are architecture-dependent */
 #ifdef CONFIG_HAVE_ARCH_PARSE_SRAT
-- 
cgit v1.3-8-gc7d7


From a586df067afe0580bb02b7a6312ca2afe49bba03 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 21 Jul 2007 17:10:00 +0200
Subject: x86: Support __attribute__((__cold__)) in gcc 4.3

gcc 4.3 supports a new __attribute__((__cold__)) to mark functions cold. Any
path directly leading to a call of this function will be unlikely. And gcc
will try to generate smaller code for the function itself.

Please use with care. The code generation advantage isn't large and in most
cases it is not worth uglifying code with this.

This patch marks some common error functions like panic(), printk()
as cold.  This will longer term make many unlikely()s unnecessary, although
we can keep them for now for older compilers.

BUG is not marked cold because there is currently no way to tell
gcc to mark a inline function told.

Also all __init and __exit functions are marked cold. With a non -Os
build this will tell the compiler to generate slightly smaller code
for them. I think it currently only uses less alignments for labels,
but that might change in the future.

One disadvantage over *likely() is that they cannot be easily instrumented
to verify them.

Another drawback is that only the latest gcc 4.3 snapshots support this.
Unfortunately we cannot detect this using the preprocessor. This means older
snapshots will fail now. I don't think that's a problem because they are
unreleased compilers that nobody should be using.

gcc also has a __hot__ attribute, but I don't see any sense in using
this in the kernel right now. But someday I hope gcc will be able
to use more aggressive optimizing for hot functions even in -Os,
if that happens it should be added.

Includes compile fix from Thomas Gleixner.

Cc: Jan Hubicka <jh@suse.cz>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc4.h | 18 ++++++++++++++++++
 include/linux/compiler.h      |  9 +++++++++
 include/linux/init.h          |  8 ++++----
 include/linux/kernel.h        |  8 ++++----
 4 files changed, 35 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index a03e9398a6c2..14f7494280f0 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -23,3 +23,21 @@
  * code
  */
 #define uninitialized_var(x) x = x
+
+#if !(__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+/* Mark functions as cold. gcc will assume any path leading to a call
+   to them will be unlikely.  This means a lot of manual unlikely()s
+   are unnecessary now for any paths leading to the usual suspects
+   like BUG(), printk(), panic() etc. [but let's keep them for now for
+   older compilers]
+
+   Early snapshots of gcc 4.3 don't support this and we can't detect this
+   in the preprocessor, but we can live with this because they're unreleased.
+   Maketime probing would be overkill here.
+
+   gcc also has a __attribute__((__hot__)) to move hot functions into
+   a special section, but I don't see any sense in this right now in
+   the kernel context */
+#define __cold			__attribute__((__cold__))
+
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8287a72bb6a9..12a1291855e2 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -174,4 +174,13 @@ extern void __chk_io_ptr(const void __iomem *);
 # define __attribute_const__	/* unimplemented */
 #endif
 
+/*
+ * Tell gcc if a function is cold. The compiler will assume any path
+ * directly leading to the call is unlikely.
+ */
+
+#ifndef __cold
+#define __cold
+#endif
+
 #endif /* __LINUX_COMPILER_H */
diff --git a/include/linux/init.h b/include/linux/init.h
index 5b5285316339..f0d0e3295a9b 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -40,10 +40,10 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__attribute__ ((__section__ (".init.text")))
+#define __init		__attribute__ ((__section__ (".init.text"))) __cold
 #define __initdata	__attribute__ ((__section__ (".init.data")))
 #define __exitdata	__attribute__ ((__section__(".exit.data")))
-#define __exit_call	__attribute_used__ __attribute__ ((__section__ (".exitcall.exit")))
+#define __exit_call	__attribute_used__ __attribute__ ((__section__ (".exitcall.exit"))) __cold
 
 /* modpost check for section mismatches during the kernel build.
  * A section mismatch happens when there are references from a
@@ -59,9 +59,9 @@
 #define __initdata_refok          __attribute__ ((__section__ (".data.init.refok")))
 
 #ifdef MODULE
-#define __exit		__attribute__ ((__section__(".exit.text")))
+#define __exit		__attribute__ ((__section__(".exit.text"))) __cold
 #else
-#define __exit		__attribute_used__ __attribute__ ((__section__(".exit.text")))
+#define __exit		__attribute_used__ __attribute__ ((__section__(".exit.text"))) __cold
 #endif
 
 /* For assembly routines */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 1eb9cde550c4..4300bb462d29 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -106,7 +106,7 @@ extern int cond_resched(void);
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
-	__attribute__ ((NORET_AND format (printf, 1, 2)));
+	__attribute__ ((NORET_AND format (printf, 1, 2))) __cold;
 extern void oops_enter(void);
 extern void oops_exit(void);
 extern int oops_may_print(void);
@@ -155,14 +155,14 @@ extern void dump_thread(struct pt_regs *regs, struct user *dump);
 asmlinkage int vprintk(const char *fmt, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
 asmlinkage int printk(const char * fmt, ...)
-	__attribute__ ((format (printf, 1, 2)));
+	__attribute__ ((format (printf, 1, 2))) __cold;
 #else
 static inline int vprintk(const char *s, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
 static inline int vprintk(const char *s, va_list args) { return 0; }
 static inline int printk(const char *s, ...)
 	__attribute__ ((format (printf, 1, 2)));
-static inline int printk(const char *s, ...) { return 0; }
+static inline int __cold printk(const char *s, ...) { return 0; }
 #endif
 
 unsigned long int_sqrt(unsigned long);
@@ -212,7 +212,7 @@ extern enum system_states {
 #define TAINT_USER			(1<<6)
 #define TAINT_DIE			(1<<7)
 
-extern void dump_stack(void);
+extern void dump_stack(void) __cold;
 
 enum {
 	DUMP_PREFIX_NONE,
-- 
cgit v1.3-8-gc7d7


From 3484d79813707bb6045773953a809abba443dc20 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Sat, 21 Jul 2007 17:10:32 +0200
Subject: x86_64: fake pxm-to-node mapping for fake numa

For NUMA emulation, our SLIT should represent the true NUMA topology of the
system but our proximity domain to node ID mapping needs to reflect the
emulated state.

When NUMA emulation has successfully setup fake nodes on the system, a new
function, acpi_fake_nodes() is called.  This function determines the proximity
domain (_PXM) for each true node found on the system.  It then finds which
emulated nodes have been allocated on this true node as determined by its
starting address.  The node ID to PXM mapping is changed so that each fake
node ID points to the PXM of the true node that it is located on.

If the machine failed to register a SLIT, then we assume there is no special
requirement for emulated node affinity so we use the default LOCAL_DISTANCE,
which is newly exported to this code, as our measurement if the emulated nodes
appear in the same PXM.  Otherwise, we use REMOTE_DISTANCE.

PXM_INVAL and NID_INVAL are also exported to the ACPI header file so that we
can compare node_to_pxm() results in generic code (in this case, the SRAT
code).

Cc: Len Brown <lenb@kernel.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86_64/mm/numa.c     |  1 +
 arch/x86_64/mm/srat.c     | 76 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/acpi/numa.c       | 11 ++++---
 include/acpi/acpi_numa.h  |  1 +
 include/asm-x86_64/acpi.h | 11 +++++++
 include/linux/acpi.h      |  3 ++
 6 files changed, 96 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 51548947ad3b..30bf8043984d 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -484,6 +484,7 @@ out:
 						nodes[i].end >> PAGE_SHIFT);
  		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
+	acpi_fake_nodes(nodes, num_nodes);
  	numa_init_array();
  	return 0;
 }
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 0e0725db20b7..7ac8ff333e84 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -350,7 +350,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
    Make sure the PXMs cover all memory. */
-static int nodes_cover_memory(void)
+static int __init nodes_cover_memory(const struct bootnode *nodes)
 {
 	int i;
 	unsigned long pxmram, e820ram;
@@ -406,7 +406,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		}
 	}
 
-	if (!nodes_cover_memory()) {
+	if (!nodes_cover_memory(nodes)) {
 		bad_srat();
 		return -1;
 	}
@@ -440,6 +440,75 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_EMU
+static int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for_each_node_mask(i, nodes_parsed) {
+		/*
+		 * Find the real node that this emulated node appears on.  For
+		 * the sake of simplicity, we only use a real node's starting
+		 * address to determine which emulated node it appears on.
+		 */
+		if (addr >= nodes[i].start && addr < nodes[i].end) {
+			ret = i;
+			break;
+		}
+	}
+	return i;
+}
+
+/*
+ * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
+ * mappings that respect the real ACPI topology but reflect our emulated
+ * environment.  For each emulated node, we find which real node it appears on
+ * and create PXM to NID mappings for those fake nodes which mirror that
+ * locality.  SLIT will now represent the correct distances between emulated
+ * nodes as a result of the real topology.
+ */
+void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
+{
+	int i;
+	int fake_node_to_pxm_map[MAX_NUMNODES] = {
+		[0 ... MAX_NUMNODES-1] = PXM_INVAL
+	};
+
+	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
+			 "topology.\n");
+	for (i = 0; i < num_nodes; i++) {
+		int nid, pxm;
+
+		nid = find_node_by_addr(fake_nodes[i].start);
+		if (nid == NUMA_NO_NODE)
+			continue;
+		pxm = node_to_pxm(nid);
+		if (pxm == PXM_INVAL)
+			continue;
+		fake_node_to_pxm_map[i] = pxm;
+	}
+	for (i = 0; i < num_nodes; i++)
+		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
+
+	nodes_clear(nodes_parsed);
+	for (i = 0; i < num_nodes; i++)
+		if (fake_nodes[i].start != fake_nodes[i].end)
+			node_set(i, nodes_parsed);
+	WARN_ON(!nodes_cover_memory(fake_nodes));
+}
+
+static int null_slit_node_compare(int a, int b)
+{
+	return node_to_pxm(a) == node_to_pxm(b);
+}
+#else
+static int null_slit_node_compare(int a, int b)
+{
+	return a == b;
+}
+#endif /* CONFIG_NUMA_EMU */
+
 void __init srat_reserve_add_area(int nodeid)
 {
 	if (found_add_area && nodes_add[nodeid].end) {
@@ -464,7 +533,8 @@ int __node_distance(int a, int b)
 	int index;
 
 	if (!acpi_slit)
-		return a == b ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
+						      REMOTE_DISTANCE;
 	index = acpi_slit->locality_count * node_to_pxm(a);
 	return acpi_slit->entry[index + node_to_pxm(b)];
 }
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 6c44b522f4d3..ab04d848b19d 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -36,8 +36,6 @@
 ACPI_MODULE_NAME("numa");
 
 static nodemask_t nodes_found_map = NODE_MASK_NONE;
-#define PXM_INVAL	-1
-#define NID_INVAL	-1
 
 /* maps to convert between proximity domain and logical node ID */
 static int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
@@ -59,6 +57,12 @@ int node_to_pxm(int node)
 	return node_to_pxm_map[node];
 }
 
+void __acpi_map_pxm_to_node(int pxm, int node)
+{
+	pxm_to_node_map[pxm] = node;
+	node_to_pxm_map[node] = pxm;
+}
+
 int acpi_map_pxm_to_node(int pxm)
 {
 	int node = pxm_to_node_map[pxm];
@@ -67,8 +71,7 @@ int acpi_map_pxm_to_node(int pxm)
 		if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
 			return NID_INVAL;
 		node = first_unset_node(nodes_found_map);
-		pxm_to_node_map[pxm] = node;
-		node_to_pxm_map[node] = pxm;
+		__acpi_map_pxm_to_node(pxm, node);
 		node_set(node, nodes_found_map);
 	}
 
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
index e2fcee2b340d..62c5ee4311da 100644
--- a/include/acpi/acpi_numa.h
+++ b/include/acpi/acpi_numa.h
@@ -13,6 +13,7 @@
 
 extern int pxm_to_node(int);
 extern int node_to_pxm(int);
+extern void __acpi_map_pxm_to_node(int, int);
 extern int acpi_map_pxm_to_node(int);
 extern void __cpuinit acpi_unmap_pxm_to_node(int);
 
diff --git a/include/asm-x86_64/acpi.h b/include/asm-x86_64/acpi.h
index a29f05087a31..1da8f49c0fe2 100644
--- a/include/asm-x86_64/acpi.h
+++ b/include/asm-x86_64/acpi.h
@@ -29,6 +29,7 @@
 #ifdef __KERNEL__
 
 #include <acpi/pdc_intel.h>
+#include <asm/numa.h>
 
 #define COMPILER_DEPENDENT_INT64   long long
 #define COMPILER_DEPENDENT_UINT64  unsigned long long
@@ -141,6 +142,16 @@ extern int acpi_pci_disabled;
 extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
 
+#ifdef CONFIG_ACPI_NUMA
+extern void __init acpi_fake_nodes(const struct bootnode *fake_nodes,
+				   int num_nodes);
+#else
+static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
+				   int num_nodes)
+{
+}
+#endif
+
 #endif /*__KERNEL__*/
 
 #endif /*_ASM_ACPI_H*/
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e88b62e6b3aa..d5680cd7746a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -231,6 +231,9 @@ extern int acpi_paddr_to_node(u64 start_addr, u64 size);
 
 extern int pnpacpi_disabled;
 
+#define PXM_INVAL	(-1)
+#define NID_INVAL	(-1)
+
 #else	/* CONFIG_ACPI */
 
 static inline int acpi_boot_init(void)
-- 
cgit v1.3-8-gc7d7


From 44bf4cea43816d43deab73c1c16361e899996eaa Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <nigel@nigel.suspend2.net>
Date: Sat, 21 Jul 2007 17:10:41 +0200
Subject: x86: PM_TRACE support

Signed-off-by: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86_64/kernel/vmlinux.lds.S  |  7 +++++++
 drivers/base/power/trace.c        |  5 ++++-
 include/asm-i386/resume-trace.h   | 13 +++++++++++++
 include/asm-x86_64/resume-trace.h | 13 +++++++++++++
 include/linux/resume-trace.h      | 19 +++++--------------
 kernel/power/Kconfig              |  2 +-
 6 files changed, 43 insertions(+), 16 deletions(-)
 create mode 100644 include/asm-i386/resume-trace.h
 create mode 100644 include/asm-x86_64/resume-trace.h

(limited to 'include/linux')

diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index c2d5a840cb1a..8778848000c2 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -54,6 +54,13 @@ SECTIONS
 
   RODATA
 
+  . = ALIGN(4);
+  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+  	__tracedata_start = .;
+	*(.tracedata)
+  	__tracedata_end = .;
+  }
+
   . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c
index a9ab30fefffc..2b0c601e422e 100644
--- a/drivers/base/power/trace.c
+++ b/drivers/base/power/trace.c
@@ -142,6 +142,7 @@ void set_trace_device(struct device *dev)
 {
 	dev_hash_value = hash_string(DEVSEED, dev->bus_id, DEVHASH);
 }
+EXPORT_SYMBOL(set_trace_device);
 
 /*
  * We could just take the "tracedata" index into the .tracedata
@@ -162,6 +163,7 @@ void generate_resume_trace(void *tracedata, unsigned int user)
 	file_hash_value = hash_string(lineno, file, FILEHASH);
 	set_magic_time(user_hash_value, file_hash_value, dev_hash_value);
 }
+EXPORT_SYMBOL(generate_resume_trace);
 
 extern char __tracedata_start, __tracedata_end;
 static int show_file_hash(unsigned int value)
@@ -170,7 +172,8 @@ static int show_file_hash(unsigned int value)
 	char *tracedata;
 
 	match = 0;
-	for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; tracedata += 6) {
+	for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ;
+			tracedata += 2 + sizeof(unsigned long)) {
 		unsigned short lineno = *(unsigned short *)tracedata;
 		const char *file = *(const char **)(tracedata + 2);
 		unsigned int hash = hash_string(lineno, file, FILEHASH);
diff --git a/include/asm-i386/resume-trace.h b/include/asm-i386/resume-trace.h
new file mode 100644
index 000000000000..ec9cfd656230
--- /dev/null
+++ b/include/asm-i386/resume-trace.h
@@ -0,0 +1,13 @@
+#define TRACE_RESUME(user) do {					\
+	if (pm_trace_enabled) {					\
+		void *tracedata;				\
+		asm volatile("movl $1f,%0\n"			\
+			".section .tracedata,\"a\"\n"		\
+			"1:\t.word %c1\n"			\
+			"\t.long %c2\n"				\
+			".previous"				\
+			:"=r" (tracedata)			\
+			: "i" (__LINE__), "i" (__FILE__));	\
+		generate_resume_trace(tracedata, user);		\
+	}							\
+} while (0)
diff --git a/include/asm-x86_64/resume-trace.h b/include/asm-x86_64/resume-trace.h
new file mode 100644
index 000000000000..34bf998fdf62
--- /dev/null
+++ b/include/asm-x86_64/resume-trace.h
@@ -0,0 +1,13 @@
+#define TRACE_RESUME(user) do {					\
+	if (pm_trace_enabled) {					\
+		void *tracedata;				\
+		asm volatile("movq $1f,%0\n"			\
+			".section .tracedata,\"a\"\n"		\
+			"1:\t.word %c1\n"			\
+			"\t.quad %c2\n"				\
+			".previous"				\
+			:"=r" (tracedata)			\
+			: "i" (__LINE__), "i" (__FILE__));	\
+		generate_resume_trace(tracedata, user);		\
+	}							\
+} while (0)
diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h
index 81e9299ca148..f3f4f28c6960 100644
--- a/include/linux/resume-trace.h
+++ b/include/linux/resume-trace.h
@@ -2,6 +2,7 @@
 #define RESUME_TRACE_H
 
 #ifdef CONFIG_PM_TRACE
+#include <asm/resume-trace.h>
 
 extern int pm_trace_enabled;
 
@@ -9,20 +10,10 @@ struct device;
 extern void set_trace_device(struct device *);
 extern void generate_resume_trace(void *tracedata, unsigned int user);
 
-#define TRACE_DEVICE(dev) set_trace_device(dev)
-#define TRACE_RESUME(user) do {					\
-	if (pm_trace_enabled) {					\
-		void *tracedata;				\
-		asm volatile("movl $1f,%0\n"			\
-			".section .tracedata,\"a\"\n"		\
-			"1:\t.word %c1\n"			\
-			"\t.long %c2\n"				\
-			".previous"				\
-			:"=r" (tracedata)			\
-			: "i" (__LINE__), "i" (__FILE__));	\
-		generate_resume_trace(tracedata, user);		\
-	}							\
-} while (0)
+#define TRACE_DEVICE(dev) do { \
+	if (pm_trace_enabled) \
+		set_trace_device(dev); \
+	} while(0)
 
 #else
 
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 7358609e4735..c1a106d87d90 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -57,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND
 
 config PM_TRACE
 	bool "Suspend/resume event tracing"
-	depends on PM_DEBUG && X86_32 && EXPERIMENTAL
+	depends on PM_DEBUG && X86 && EXPERIMENTAL
 	default n
 	---help---
 	This enables some cheesy code to save the last PM event point in the
-- 
cgit v1.3-8-gc7d7


From 9585116ba09f1d8c52d0a1346e20bb9d443e9c02 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Sat, 21 Jul 2007 17:11:35 +0200
Subject: i386: fix iounmap's use of vm_struct's size field

get_vm_area always returns an area with an adjacent guard page.  That guard
page is included in vm_struct.size.  iounmap uses vm_struct.size to
determine how much address space needs to have change_page_attr applied to
it, which will BUG if applied to the guard page.

This patch adds a helper function - get_vm_area_size() in linux/vmalloc.h -
to return the actual size of a vm area, and uses it to make iounmap do the
right thing.  There are probably other places which should be using
get_vm_area_size().

Thanks to Dave Young <hidave.darkstar@gmail.com> for debugging the
problem.

[ Andi, it wasn't clear to me whether x86_64 needs the same fix. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Dave Young <hidave.darkstar@gmail.com>
Cc: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/mm/ioremap.c  | 2 +-
 include/linux/vmalloc.h | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c
index fff08ae7b5ed..0b278315d737 100644
--- a/arch/i386/mm/ioremap.c
+++ b/arch/i386/mm/ioremap.c
@@ -196,7 +196,7 @@ void iounmap(volatile void __iomem *addr)
 	/* Reset the direct mapping. Can block */
 	if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
 		change_page_attr(virt_to_page(__va(p->phys_addr)),
-				 p->size >> PAGE_SHIFT,
+				 get_vm_area_size(p) >> PAGE_SHIFT,
 				 PAGE_KERNEL);
 		global_flush_tlb();
 	} 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c2b10cae5da5..89338b468d0d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -58,6 +58,13 @@ void vmalloc_sync_all(void);
 /*
  *	Lowlevel-APIs (not for driver use!)
  */
+
+static inline size_t get_vm_area_size(const struct vm_struct *area)
+{
+	/* return actual size without guard page */
+	return area->size - PAGE_SIZE;
+}
+
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 					unsigned long start, unsigned long end);
-- 
cgit v1.3-8-gc7d7


From e97e2ddf07d6b6c2d621ddaec277e19f86c0cdb1 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <samuel@sortiz.org>
Date: Sat, 21 Jul 2007 19:07:33 -0700
Subject: [IrDA]: EP7211 IR driver port to the latest SIR API

The EP7211 SIR driver was the only one left without a new SIR API port.

Signed-off-by: Samuel Ortiz <samuel@sortiz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/irda/Kconfig      |  9 +++++
 drivers/net/irda/Makefile     |  1 +
 drivers/net/irda/ep7211-sir.c | 89 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/irda.h          |  1 +
 4 files changed, 100 insertions(+)
 create mode 100644 drivers/net/irda/ep7211-sir.c

(limited to 'include/linux')

diff --git a/drivers/net/irda/Kconfig b/drivers/net/irda/Kconfig
index 829da9a1d113..266ba7d8f3d1 100644
--- a/drivers/net/irda/Kconfig
+++ b/drivers/net/irda/Kconfig
@@ -155,6 +155,15 @@ config KINGSUN_DONGLE
 	  To compile it as a module, choose M here: the module will be called
 	  kingsun-sir.
 
+config EP7211_DONGLE
+	tristate "EP7211 I/R support"
+	depends on IRTTY_SIR && ARCH_EP7211 && IRDA && EXPERIMENTAL
+	help
+	  Say Y here if you want to build support for the Cirrus logic
+	  EP7211 chipset's infrared module.
+
+
+
 comment "Old SIR device drivers"
 
 config IRPORT_SIR
diff --git a/drivers/net/irda/Makefile b/drivers/net/irda/Makefile
index 233a2f923730..2808ef5c7b79 100644
--- a/drivers/net/irda/Makefile
+++ b/drivers/net/irda/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_MCP2120_DONGLE)	+= mcp2120-sir.o
 obj-$(CONFIG_ACT200L_DONGLE)	+= act200l-sir.o
 obj-$(CONFIG_MA600_DONGLE)	+= ma600-sir.o
 obj-$(CONFIG_TOIM3232_DONGLE)	+= toim3232-sir.o
+obj-$(CONFIG_EP7211_DONGLE)	+= ep7211-sir.o
 obj-$(CONFIG_KINGSUN_DONGLE)	+= kingsun-sir.o
 
 # The SIR helper module
diff --git a/drivers/net/irda/ep7211-sir.c b/drivers/net/irda/ep7211-sir.c
new file mode 100644
index 000000000000..831572429bb9
--- /dev/null
+++ b/drivers/net/irda/ep7211-sir.c
@@ -0,0 +1,89 @@
+/*
+ * IR port driver for the Cirrus Logic EP7211 processor.
+ *
+ * Copyright 2001, Blue Mug Inc.  All rights reserved.
+ * Copyright 2007, Samuel Ortiz <samuel@sortiz.org>
+ */
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/tty.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irda_device.h>
+
+#include <asm/io.h>
+#include <asm/hardware.h>
+
+#include "sir-dev.h"
+
+#define MIN_DELAY 25      /* 15 us, but wait a little more to be sure */
+#define MAX_DELAY 10000   /* 1 ms */
+
+static int ep7211_open(struct sir_dev *dev);
+static int ep7211_close(struct sir_dev *dev);
+static int ep7211_change_speed(struct sir_dev *dev, unsigned speed);
+static int ep7211_reset(struct sir_dev *dev);
+
+static struct dongle_driver ep7211 = {
+	.owner		= THIS_MODULE,
+	.driver_name	= "EP7211 IR driver",
+	.type		= IRDA_EP7211_DONGLE,
+	.open		= ep7211_open,
+	.close		= ep7211_close,
+	.reset		= ep7211_reset,
+	.set_speed	= ep7211_change_speed,
+};
+
+static int __init ep7211_sir_init(void)
+{
+	return irda_register_dongle(&ep7211);
+}
+
+static void __exit ep7211_sir_cleanup(void)
+{
+	irda_unregister_dongle(&ep7211);
+}
+
+static int ep7211_open(struct sir_dev *dev)
+{
+	unsigned int syscon;
+
+	/* Turn on the SIR encoder. */
+	syscon = clps_readl(SYSCON1);
+	syscon |= SYSCON1_SIREN;
+	clps_writel(syscon, SYSCON1);
+
+	return 0;
+}
+
+static int ep7211_close(struct sir_dev *dev)
+{
+	unsigned int syscon;
+
+	/* Turn off the SIR encoder. */
+	syscon = clps_readl(SYSCON1);
+	syscon &= ~SYSCON1_SIREN;
+	clps_writel(syscon, SYSCON1);
+
+	return 0;
+}
+
+static int ep7211_change_speed(struct sir_dev *dev, unsigned speed)
+{
+	return 0;
+}
+
+static int ep7211_reset(struct sir_dev *dev)
+{
+	return 0;
+}
+
+MODULE_AUTHOR("Samuel Ortiz <samuel@sortiz.org>");
+MODULE_DESCRIPTION("EP7211 IR dongle driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("irda-dongle-13"); /* IRDA_EP7211_DONGLE */
+
+module_init(ep7211_sir_init);
+module_exit(ep7211_sir_cleanup);
diff --git a/include/linux/irda.h b/include/linux/irda.h
index 8e3735714c1c..28f88ecba344 100644
--- a/include/linux/irda.h
+++ b/include/linux/irda.h
@@ -77,6 +77,7 @@ typedef enum {
 	IRDA_ACT200L_DONGLE      = 10,
 	IRDA_MA600_DONGLE        = 11,
 	IRDA_TOIM3232_DONGLE     = 12,
+	IRDA_EP7211_DONGLE       = 13,
 } IRDA_DONGLE;
 
 /* Protocol types to be used for SOCK_DGRAM */
-- 
cgit v1.3-8-gc7d7


From d5a2f2f1d68e2da538ac28540cddd9ccc733b001 Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Wed, 18 Jul 2007 23:45:42 -0300
Subject: ACPI: thinkpad-acpi: store ThinkPad model information

Keep note of ThinkPad model, BIOS and EC firmware information, and log it
on startup.  Makes for far more readable code in places, too.

This patch also adds Lenovo's PCI ID to the pci ids table.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/misc/thinkpad_acpi.c | 98 +++++++++++++++++++++++++++++++-------------
 drivers/misc/thinkpad_acpi.h | 17 +++++++-
 include/linux/pci_ids.h      |  2 +
 3 files changed, 87 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/thinkpad_acpi.c b/drivers/misc/thinkpad_acpi.c
index 44aa8c92f91f..99500af651c1 100644
--- a/drivers/misc/thinkpad_acpi.c
+++ b/drivers/misc/thinkpad_acpi.c
@@ -717,9 +717,19 @@ static int __init thinkpad_acpi_driver_init(struct ibm_init_struct *iibm)
 	printk(IBM_INFO "%s v%s\n", IBM_DESC, IBM_VERSION);
 	printk(IBM_INFO "%s\n", IBM_URL);
 
-	if (ibm_thinkpad_ec_found)
-		printk(IBM_INFO "ThinkPad EC firmware %s\n",
-		       ibm_thinkpad_ec_found);
+	printk(IBM_INFO "ThinkPad BIOS %s, EC %s\n",
+		(thinkpad_id.bios_version_str) ?
+			thinkpad_id.bios_version_str : "unknown",
+		(thinkpad_id.ec_version_str) ?
+			thinkpad_id.ec_version_str : "unknown");
+
+	if (thinkpad_id.vendor && thinkpad_id.model_str)
+		printk(IBM_INFO "%s %s\n",
+			(thinkpad_id.vendor == PCI_VENDOR_ID_IBM) ?
+				"IBM" : ((thinkpad_id.vendor ==
+						PCI_VENDOR_ID_LENOVO) ?
+					"Lenovo" : "Unknown vendor"),
+			thinkpad_id.model_str);
 
 	return 0;
 }
@@ -2648,7 +2658,7 @@ static int __init thermal_init(struct ibm_init_struct *iibm)
 
 	acpi_tmp7 = acpi_evalf(ec_handle, NULL, "TMP7", "qv");
 
-	if (ibm_thinkpad_ec_found && experimental) {
+	if (thinkpad_id.ec_model && experimental) {
 		/*
 		 * Direct EC access mode: sensors at registers
 		 * 0x78-0x7F, 0xC0-0xC7.  Registers return 0x00 for
@@ -3532,20 +3542,19 @@ static int __init fan_init(struct ibm_init_struct *iibm)
 			 * Enable for TP-1Y (T43), TP-78 (R51e),
 			 * TP-76 (R52), TP-70 (T43, R52), which are known
 			 * to be buggy. */
-			if (fan_control_initial_status == 0x07 &&
-			    ibm_thinkpad_ec_found &&
-			    ((ibm_thinkpad_ec_found[0] == '1' &&
-			      ibm_thinkpad_ec_found[1] == 'Y') ||
-			     (ibm_thinkpad_ec_found[0] == '7' &&
-			      (ibm_thinkpad_ec_found[1] == '6' ||
-			       ibm_thinkpad_ec_found[1] == '8' ||
-			       ibm_thinkpad_ec_found[1] == '0'))
-			    )) {
-				printk(IBM_NOTICE
-				       "fan_init: initial fan status is "
-				       "unknown, assuming it is in auto "
-				       "mode\n");
-				tp_features.fan_ctrl_status_undef = 1;
+			if (fan_control_initial_status == 0x07) {
+				switch (thinkpad_id.ec_model) {
+				case 0x5931: /* TP-1Y */
+				case 0x3837: /* TP-78 */
+				case 0x3637: /* TP-76 */
+				case 0x3037: /* TP-70 */
+					printk(IBM_NOTICE
+					       "fan_init: initial fan status is "
+					       "unknown, assuming it is in auto "
+					       "mode\n");
+					tp_features.fan_ctrl_status_undef = 1;
+					;;
+				}
 			}
 		} else {
 			printk(IBM_ERR
@@ -4279,13 +4288,30 @@ static void ibm_exit(struct ibm_struct *ibm)
 
 /* Probing */
 
-static char *ibm_thinkpad_ec_found;
-
-static char* __init check_dmi_for_ec(void)
+static void __init get_thinkpad_model_data(struct thinkpad_id_data *tp)
 {
 	struct dmi_device *dev = NULL;
 	char ec_fw_string[18];
 
+	if (!tp)
+		return;
+
+	memset(tp, 0, sizeof(*tp));
+
+	if (dmi_name_in_vendors("IBM"))
+		tp->vendor = PCI_VENDOR_ID_IBM;
+	else if (dmi_name_in_vendors("LENOVO"))
+		tp->vendor = PCI_VENDOR_ID_LENOVO;
+	else
+		return;
+
+	tp->bios_version_str = kstrdup(dmi_get_system_info(DMI_BIOS_VERSION),
+					GFP_KERNEL);
+	if (!tp->bios_version_str)
+		return;
+	tp->bios_model = tp->bios_version_str[0]
+			 | (tp->bios_version_str[1] << 8);
+
 	/*
 	 * ThinkPad T23 or newer, A31 or newer, R50e or newer,
 	 * X32 or newer, all Z series;  Some models must have an
@@ -4299,10 +4325,20 @@ static char* __init check_dmi_for_ec(void)
 			   ec_fw_string) == 1) {
 			ec_fw_string[sizeof(ec_fw_string) - 1] = 0;
 			ec_fw_string[strcspn(ec_fw_string, " ]")] = 0;
-			return kstrdup(ec_fw_string, GFP_KERNEL);
+
+			tp->ec_version_str = kstrdup(ec_fw_string, GFP_KERNEL);
+			tp->ec_model = ec_fw_string[0]
+					| (ec_fw_string[1] << 8);
+			break;
 		}
 	}
-	return NULL;
+
+	tp->model_str = kstrdup(dmi_get_system_info(DMI_PRODUCT_VERSION),
+					GFP_KERNEL);
+	if (strnicmp(tp->model_str, "ThinkPad", 8) != 0) {
+		kfree(tp->model_str);
+		tp->model_str = NULL;
+	}
 }
 
 static int __init probe_for_thinkpad(void)
@@ -4316,7 +4352,7 @@ static int __init probe_for_thinkpad(void)
 	 * Non-ancient models have better DMI tagging, but very old models
 	 * don't.
 	 */
-	is_thinkpad = dmi_name_in_vendors("ThinkPad");
+	is_thinkpad = (thinkpad_id.model_str != NULL);
 
 	/* ec is required because many other handles are relative to it */
 	IBM_ACPIHANDLE_INIT(ec);
@@ -4332,7 +4368,7 @@ static int __init probe_for_thinkpad(void)
 	 * false positives a damn great deal
 	 */
 	if (!is_thinkpad)
-		is_thinkpad = dmi_name_in_vendors("IBM");
+		is_thinkpad = (thinkpad_id.vendor == PCI_VENDOR_ID_IBM);
 
 	if (!is_thinkpad && !force_load)
 		return -ENODEV;
@@ -4475,12 +4511,16 @@ static int __init thinkpad_acpi_module_init(void)
 	int ret, i;
 
 	/* Driver-level probe */
+
+	get_thinkpad_model_data(&thinkpad_id);
 	ret = probe_for_thinkpad();
-	if (ret)
+	if (ret) {
+		thinkpad_acpi_module_exit();
 		return ret;
+	}
 
 	/* Driver initialization */
-	ibm_thinkpad_ec_found = check_dmi_for_ec();
+
 	IBM_ACPIHANDLE_INIT(ecrd);
 	IBM_ACPIHANDLE_INIT(ecwr);
 
@@ -4590,7 +4630,9 @@ static void thinkpad_acpi_module_exit(void)
 	if (proc_dir)
 		remove_proc_entry(IBM_PROC_DIR, acpi_root_dir);
 
-	kfree(ibm_thinkpad_ec_found);
+	kfree(thinkpad_id.bios_version_str);
+	kfree(thinkpad_id.ec_version_str);
+	kfree(thinkpad_id.model_str);
 }
 
 module_init(thinkpad_acpi_module_init);
diff --git a/drivers/misc/thinkpad_acpi.h b/drivers/misc/thinkpad_acpi.h
index fee04214a10b..09b2282fed0b 100644
--- a/drivers/misc/thinkpad_acpi.h
+++ b/drivers/misc/thinkpad_acpi.h
@@ -175,9 +175,7 @@ static void tpacpi_remove_driver_attributes(struct device_driver *drv);
 static int experimental;
 static u32 dbg_level;
 static int force_load;
-static char *ibm_thinkpad_ec_found;
 
-static char* check_dmi_for_ec(void);
 static int thinkpad_acpi_module_init(void);
 static void thinkpad_acpi_module_exit(void);
 
@@ -244,6 +242,21 @@ static struct {
 	u16 input_device_registered:1;
 } tp_features;
 
+struct thinkpad_id_data {
+	unsigned int vendor;	/* ThinkPad vendor:
+				 * PCI_VENDOR_ID_IBM/PCI_VENDOR_ID_LENOVO */
+
+	char *bios_version_str;	/* Something like 1ZET51WW (1.03z) */
+	char *ec_version_str;	/* Something like 1ZHT51WW-1.04a */
+
+	u16 bios_model;		/* Big Endian, TP-1Y = 0x5931, 0 = unknown */
+	u16 ec_model;
+
+	char *model_str;
+};
+
+static struct thinkpad_id_data thinkpad_id;
+
 static struct list_head tpacpi_all_drivers;
 
 static struct ibm_init_struct ibms_init[];
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index b15c6498fe67..ced4d3f76104 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2040,6 +2040,8 @@
 #define PCI_DEVICE_ID_ALTIMA_AC9100	0x03ea
 #define PCI_DEVICE_ID_ALTIMA_AC1003	0x03eb
 
+#define PCI_VENDOR_ID_LENOVO		0x17aa
+
 #define PCI_VENDOR_ID_ARECA		0x17d3
 #define PCI_DEVICE_ID_ARECA_1110	0x1110
 #define PCI_DEVICE_ID_ARECA_1120	0x1120
-- 
cgit v1.3-8-gc7d7


From 8bf8df7120006b8c97ad3a9fcc79e2ba894c46dd Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Sun, 22 Jul 2007 00:23:03 +1000
Subject: [POWERPC] Constify of_platform_driver name

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 drivers/pcmcia/m8xx_pcmcia.c | 2 +-
 include/linux/of_platform.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c
index 3c45142c40b2..b01985498460 100644
--- a/drivers/pcmcia/m8xx_pcmcia.c
+++ b/drivers/pcmcia/m8xx_pcmcia.c
@@ -1316,7 +1316,7 @@ static struct of_device_id m8xx_pcmcia_match[] = {
 MODULE_DEVICE_TABLE(of, m8xx_pcmcia_match);
 
 static struct of_platform_driver m8xx_pcmcia_driver = {
-	.name = (char *)driver_name,
+	.name = driver_name,
 	.match_table = m8xx_pcmcia_match,
 	.probe = m8xx_probe,
 	.remove = m8xx_remove,
diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 5fd44e63fb26..22c3837784b2 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -31,7 +31,7 @@ extern struct bus_type of_platform_bus_type;
  */
 struct of_platform_driver
 {
-	char			*name;
+	const char		*name;
 	struct of_device_id	*match_table;
 	struct module		*owner;
 
-- 
cgit v1.3-8-gc7d7


From 51d261122d0ffac8cf91cc6e74ffcfea23faeb1c Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Sun, 22 Jul 2007 00:27:01 +1000
Subject: [POWERPC] Constify of_platform_driver match_table

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/of_platform.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index 22c3837784b2..448f70b30a0c 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -32,7 +32,7 @@ extern struct bus_type of_platform_bus_type;
 struct of_platform_driver
 {
 	const char		*name;
-	struct of_device_id	*match_table;
+	const struct of_device_id	*match_table;
 	struct module		*owner;
 
 	int	(*probe)(struct of_device* dev,
-- 
cgit v1.3-8-gc7d7


From 41e1703b9b88cf9b5e91cdd2f7dcded3ec3917cb Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <tomof@acm.org>
Date: Sun, 22 Jul 2007 10:06:50 +0900
Subject: [SCSI] bsg: unexport sg v3 helper functions

blk_fill_sghdr_rq, blk_unmap_sghdr_rq, and blk_complete_sghdr_rq were
exported for bsg, however bsg was changed to support only sg v4.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 block/scsi_ioctl.c     | 13 +++++--------
 include/linux/blkdev.h |  5 -----
 2 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a26ba07955fe..7bfebd574e55 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -214,8 +214,8 @@ int blk_verify_command(unsigned char *cmd, int has_write_perm)
 }
 EXPORT_SYMBOL_GPL(blk_verify_command);
 
-int blk_fill_sghdr_rq(request_queue_t *q, struct request *rq,
-		      struct sg_io_hdr *hdr, int has_write_perm)
+static int blk_fill_sghdr_rq(request_queue_t *q, struct request *rq,
+			     struct sg_io_hdr *hdr, int has_write_perm)
 {
 	memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
 
@@ -238,22 +238,20 @@ int blk_fill_sghdr_rq(request_queue_t *q, struct request *rq,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(blk_fill_sghdr_rq);
 
 /*
  * unmap a request that was previously mapped to this sg_io_hdr. handles
  * both sg and non-sg sg_io_hdr.
  */
-int blk_unmap_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr)
+static int blk_unmap_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr)
 {
 	blk_rq_unmap_user(rq->bio);
 	blk_put_request(rq);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(blk_unmap_sghdr_rq);
 
-int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
-			  struct bio *bio)
+static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
+				 struct bio *bio)
 {
 	int r, ret = 0;
 
@@ -287,7 +285,6 @@ int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
 
 	return r;
 }
-EXPORT_SYMBOL_GPL(blk_complete_sghdr_rq);
 
 static int sg_io(struct file *file, request_queue_t *q,
 		struct gendisk *bd_disk, struct sg_io_hdr *hdr)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f78965fc6426..695e34964cb7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -698,11 +698,6 @@ extern int blk_execute_rq(request_queue_t *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
-extern int blk_fill_sghdr_rq(request_queue_t *, struct request *,
-		      struct sg_io_hdr *, int);
-extern int blk_unmap_sghdr_rq(struct request *, struct sg_io_hdr *);
-extern int blk_complete_sghdr_rq(struct request *, struct sg_io_hdr *,
-			  struct bio *);
 extern int blk_verify_command(unsigned char *, int);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
-- 
cgit v1.3-8-gc7d7


From 74f2345b6be1410f824cb7dd638d2c10a9709379 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 4 Jun 2007 17:00:14 -0400
Subject: [PATCH] allow audit filtering on bit & operations

Right now the audit filter can match on = != > < >= blah blah blah.
This allow the filter to also look at bitwise AND operations, &

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h | 30 +++++++++++++++++-------------
 kernel/auditfilter.c  | 11 +++++++++++
 2 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8ca7ca0b47f0..a35859ab2fdb 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -161,7 +161,7 @@
  * are currently used in an audit field constant understood by the kernel.
  * If you are adding a new #define AUDIT_<whatever>, please ensure that
  * AUDIT_UNUSED_BITS is updated if need be. */
-#define AUDIT_UNUSED_BITS	0x0FFFFC00
+#define AUDIT_UNUSED_BITS	0x07FFFC00
 
 
 /* Rule fields */
@@ -213,25 +213,29 @@
 #define AUDIT_NEGATE			0x80000000
 
 /* These are the supported operators.
- *	4  2  1
- *	=  >  <
- *	-------
- *	0  0  0		0	nonsense
- *	0  0  1		1	<
- *	0  1  0		2	>
- *	0  1  1		3	!=
- *	1  0  0		4	=
- *	1  0  1		5	<=
- *	1  1  0		6	>=
- *	1  1  1		7	all operators
+ *	4  2  1  8
+ *	=  >  <  ?
+ *	----------
+ *	0  0  0	 0	00	nonsense
+ *	0  0  0	 1	08	&  bit mask
+ *	0  0  1	 0	10	<
+ *	0  1  0	 0	20	>
+ *	0  1  1	 0	30	!=
+ *	1  0  0	 0	40	=
+ *	1  0  0	 1	48	&=  bit test
+ *	1  0  1	 0	50	<=
+ *	1  1  0	 0	60	>=
+ *	1  1  1	 1	78	all operators
  */
+#define AUDIT_BIT_MASK			0x08000000
 #define AUDIT_LESS_THAN			0x10000000
 #define AUDIT_GREATER_THAN		0x20000000
 #define AUDIT_NOT_EQUAL			0x30000000
 #define AUDIT_EQUAL			0x40000000
+#define AUDIT_BIT_TEST			(AUDIT_BIT_MASK|AUDIT_EQUAL)
 #define AUDIT_LESS_THAN_OR_EQUAL	(AUDIT_LESS_THAN|AUDIT_EQUAL)
 #define AUDIT_GREATER_THAN_OR_EQUAL	(AUDIT_GREATER_THAN|AUDIT_EQUAL)
-#define AUDIT_OPERATORS			(AUDIT_EQUAL|AUDIT_NOT_EQUAL)
+#define AUDIT_OPERATORS			(AUDIT_EQUAL|AUDIT_NOT_EQUAL|AUDIT_BIT_MASK)
 
 /* Status symbols */
 				/* Mask values */
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0ea96bab91cc..359645cff5b2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -456,6 +456,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_DEVMINOR:
 		case AUDIT_EXIT:
 		case AUDIT_SUCCESS:
+			/* bit ops are only useful on syscall args */
+			if (f->op == AUDIT_BIT_MASK ||
+						f->op == AUDIT_BIT_TEST) {
+				err = -EINVAL;
+				goto exit_free;
+			}
+			break;
 		case AUDIT_ARG0:
 		case AUDIT_ARG1:
 		case AUDIT_ARG2:
@@ -1566,6 +1573,10 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
 		return (left > right);
 	case AUDIT_GREATER_THAN_OR_EQUAL:
 		return (left >= right);
+	case AUDIT_BIT_MASK:
+		return (left & right);
+	case AUDIT_BIT_TEST:
+		return ((left & right) == right);
 	}
 	BUG();
 	return 0;
-- 
cgit v1.3-8-gc7d7


From 4259fa01a2d2aa3e589b34ba7624080232d9c1ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Jun 2007 11:13:31 -0400
Subject: [PATCH] get rid of AVC_PATH postponed treatment

        Selinux folks had been complaining about the lack of AVC_PATH
records when audit is disabled.  I must admit my stupidity - I assumed
that avc_audit() really couldn't use audit_log_d_path() because of
deadlocks (== could be called with dcache_lock or vfsmount_lock held).
Shouldn't have made that assumption - it never gets called that way.
It _is_ called under spinlocks, but not those.

        Since audit_log_d_path() uses ab->gfp_mask for allocations,
kmalloc() in there is not a problem.  IOW, the simple fix is sufficient:
let's rip AUDIT_AVC_PATH out and simply generate pathname as part of main
record.  It's trivial to do.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: James Morris <jmorris@namei.org>
---
 include/linux/audit.h  |  2 --
 kernel/auditsc.c       | 47 -----------------------------------------------
 security/selinux/avc.c | 15 ++++++++-------
 3 files changed, 8 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index a35859ab2fdb..4bbd8601b8f0 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -411,7 +411,6 @@ extern int audit_bprm(struct linux_binprm *bprm);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int __audit_fd_pair(int fd1, int fd2);
-extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern int audit_set_macxattr(const char *name);
 extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr);
 extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout);
@@ -491,7 +490,6 @@ extern int audit_signals;
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_fd_pair(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
-#define audit_avc_path(dentry, mnt) ({ 0; })
 #define audit_set_macxattr(n) do { ; } while (0)
 #define audit_mq_open(o,m,a) ({ 0; })
 #define audit_mq_timedsend(d,l,p,t) ({ 0; })
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f5e917e60ac2..bde1124d5908 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -173,12 +173,6 @@ struct audit_aux_data_fd_pair {
 	int	fd[2];
 };
 
-struct audit_aux_data_path {
-	struct audit_aux_data	d;
-	struct dentry		*dentry;
-	struct vfsmount		*mnt;
-};
-
 struct audit_aux_data_pids {
 	struct audit_aux_data	d;
 	pid_t			target_pid[AUDIT_AUX_PIDS];
@@ -654,12 +648,6 @@ static inline void audit_free_aux(struct audit_context *context)
 	struct audit_aux_data *aux;
 
 	while ((aux = context->aux)) {
-		if (aux->type == AUDIT_AVC_PATH) {
-			struct audit_aux_data_path *axi = (void *)aux;
-			dput(axi->dentry);
-			mntput(axi->mnt);
-		}
-
 		context->aux = aux->next;
 		kfree(aux);
 	}
@@ -1038,11 +1026,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_hex(ab, axs->a, axs->len);
 			break; }
 
-		case AUDIT_AVC_PATH: {
-			struct audit_aux_data_path *axi = (void *)aux;
-			audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
-			break; }
-
 		case AUDIT_FD_PAIR: {
 			struct audit_aux_data_fd_pair *axs = (void *)aux;
 			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
@@ -1990,36 +1973,6 @@ void __audit_ptrace(struct task_struct *t)
 	selinux_get_task_sid(t, &context->target_sid);
 }
 
-/**
- * audit_avc_path - record the granting or denial of permissions
- * @dentry: dentry to record
- * @mnt: mnt to record
- *
- * Returns 0 for success or NULL context or < 0 on error.
- *
- * Called from security/selinux/avc.c::avc_audit()
- */
-int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
-{
-	struct audit_aux_data_path *ax;
-	struct audit_context *context = current->audit_context;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->dentry = dget(dentry);
-	ax->mnt = mntget(mnt);
-
-	ax->d.type = AUDIT_AVC_PATH;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
-}
-
 /**
  * audit_signal_info - record signal info for shutting down audit subsystem
  * @sig: signal value
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index ecd067384531..0e69adf63bdb 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -570,10 +570,12 @@ void avc_audit(u32 ssid, u32 tsid,
 		case AVC_AUDIT_DATA_FS:
 			if (a->u.fs.dentry) {
 				struct dentry *dentry = a->u.fs.dentry;
-				if (a->u.fs.mnt)
-					audit_avc_path(dentry, a->u.fs.mnt);
-				audit_log_format(ab, " name=");
-				audit_log_untrustedstring(ab, dentry->d_name.name);
+				if (a->u.fs.mnt) {
+					audit_log_d_path(ab, "path=", dentry, a->u.fs.mnt);
+				} else {
+					audit_log_format(ab, " name=");
+					audit_log_untrustedstring(ab, dentry->d_name.name);
+				}
 				inode = dentry->d_inode;
 			} else if (a->u.fs.inode) {
 				struct dentry *dentry;
@@ -624,9 +626,8 @@ void avc_audit(u32 ssid, u32 tsid,
 				case AF_UNIX:
 					u = unix_sk(sk);
 					if (u->dentry) {
-						audit_avc_path(u->dentry, u->mnt);
-						audit_log_format(ab, " name=");
-						audit_log_untrustedstring(ab, u->dentry->d_name.name);
+						audit_log_d_path(ab, "path=",
+								 u->dentry, u->mnt);
 						break;
 					}
 					if (!u->addr)
-- 
cgit v1.3-8-gc7d7


From abd4f7505bafdd6c5319fe3cb5caf9af6104e17a Mon Sep 17 00:00:00 2001
From: Masoud Asgharifard Sharbiani <masouds@google.com>
Date: Sun, 22 Jul 2007 11:12:28 +0200
Subject: x86: i386-show-unhandled-signals-v3

This patch makes the i386 behave the same way that x86_64 does when a
segfault happens.  A line gets printed to the kernel log so that tools
that need to check for failures can behave more uniformly between
debug.show_unhandled_signals sysctl variable to 0 (or by doing echo 0 >
/proc/sys/debug/exception-trace)

Also, all of the lines being printed are now using printk_ratelimit() to
deny the ability of DoS from a local user with a program like the
following:

main()
{
       while (1)
               if (!fork()) *(int *)0 = 0;
}

This new revision also includes the fix that Andrew did which got rid of
new sysctl that was added to the system in earlier versions of this.
Also, 'show-unhandled-signals' sysctl has been renamed back to the old
'exception-trace' to avoid breakage of people's scripts.

AK: Enabling by default for i386 will be likely controversal, but let's see what happens
AK: Really folks, before complaining just fix your segfaults
AK: I bet this will find a lot of silent issues

Signed-off-by: Masoud Sharbiani <masouds@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
[ Personally, I've found the complaints useful on x86-64, so I'm all for
  this. That said, I wonder if we could do it more prettily..   -Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/i386/kernel/signal.c   |  7 +++++++
 arch/i386/kernel/traps.c    |  7 +++++++
 arch/i386/mm/fault.c        | 10 ++++++++++
 arch/x86_64/kernel/signal.c |  2 +-
 arch/x86_64/kernel/traps.c  |  6 ++++--
 arch/x86_64/mm/fault.c      | 15 +++------------
 arch/x86_64/mm/init.c       | 33 ---------------------------------
 include/asm-x86_64/proto.h  |  2 --
 include/linux/signal.h      |  3 +++
 kernel/signal.c             | 10 ++++++++++
 kernel/sysctl.c             | 10 ++++++++++
 11 files changed, 55 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index d574e38f0f77..f5dd85656c18 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused)
 	return eax;
 
 badframe:
+	if (show_unhandled_signals && printk_ratelimit())
+		printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
+		       " esp:%lx oeax:%lx\n",
+		    current->pid > 1 ? KERN_INFO : KERN_EMERG,
+		    current->comm, current->pid, frame, regs->eip,
+		    regs->esp, regs->orig_eax);
+
 	force_sig(SIGSEGV, current);
 	return 0;
 }	
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 57772a18c394..438949da3b63 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -618,6 +618,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
 
 	current->thread.error_code = error_code;
 	current->thread.trap_no = 13;
+	if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
+	    printk_ratelimit())
+		printk(KERN_INFO
+		    "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
+		    current->comm, current->pid,
+		    regs->eip, regs->esp, error_code);
+
 	force_sig(SIGSEGV, current);
 	return;
 
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index e92a10124935..01ffdd4964f0 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address)
 	return 0;
 }
 
+int show_unhandled_signals = 1;
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -469,6 +471,14 @@ bad_area_nosemaphore:
 		if (is_prefetch(regs, address, error_code))
 			return;
 
+		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+		    printk_ratelimit()) {
+			printk("%s%s[%d]: segfault at %08lx eip %08lx "
+			    "esp %08lx error %lx\n",
+			    tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+			    tsk->comm, tsk->pid, address, regs->eip,
+			    regs->esp, error_code);
+		}
 		tsk->thread.cr2 = address;
 		/* Kernel addresses are always protection faults */
 		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 4886afcd6287..739175b01e06 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -487,7 +487,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 { 
 	struct task_struct *me = current; 
-	if (exception_trace)
+	if (show_unhandled_signals && printk_ratelimit())
 		printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
 	       me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); 
 
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 8713ad4a4db1..03888420775d 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -584,7 +584,8 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_no = trapnr;
 
-		if (exception_trace && unhandled_signal(tsk, signr))
+		if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+		    printk_ratelimit())
 			printk(KERN_INFO
 			       "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
 			       tsk->comm, tsk->pid, str,
@@ -688,7 +689,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_no = 13;
 
-		if (exception_trace && unhandled_signal(tsk, SIGSEGV))
+		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+		    printk_ratelimit())
 			printk(KERN_INFO
 		       "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
 			       tsk->comm, tsk->pid,
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 2074bddd4f04..5e9ac70c135e 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
 	return 0;
 } 
 
-int unhandled_signal(struct task_struct *tsk, int sig)
-{
-	if (is_init(tsk))
-		return 1;
-	if (tsk->ptrace & PT_PTRACED)
-		return 0;
-	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
-		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
-}
-
 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 				 unsigned long error_code)
 {
@@ -302,7 +292,7 @@ static int vmalloc_fault(unsigned long address)
 }
 
 static int page_fault_trace;
-int exception_trace = 1;
+int show_unhandled_signals = 1;
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -494,7 +484,8 @@ bad_area_nosemaphore:
 		    (address >> 32))
 			return;
 
-		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
+		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+		    printk_ratelimit()) {
 			printk(
 		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
 					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 381c2ecd407e..88678e82e23d 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -697,39 +697,6 @@ int kern_addr_valid(unsigned long addr)
 	return pfn_valid(pte_pfn(*pte));
 }
 
-#ifdef CONFIG_SYSCTL
-#include <linux/sysctl.h>
-
-static ctl_table debug_table2[] = {
-	{
-		.ctl_name	= 99,
-		.procname	= "exception-trace",
-		.data		= &exception_trace,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
-	{}
-}; 
-
-static ctl_table debug_root_table2[] = { 
-	{
-		.ctl_name = CTL_DEBUG,
-		.procname = "debug",
-		.mode = 0555,
-		.child = debug_table2
-	},
-	{}
-}; 
-
-static __init int x8664_sysctl_init(void)
-{ 
-	register_sysctl_table(debug_root_table2);
-	return 0;
-}
-__initcall(x8664_sysctl_init);
-#endif
-
 /* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
    covers the 64bit vsyscall page now. 32bit has a real VMA now and does
    not need special handling anymore. */
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index d6e3225549c0..31f20ad65876 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -75,8 +75,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 extern void early_quirks(void);
 extern void check_efer(void);
 
-extern int unhandled_signal(struct task_struct *tsk, int sig);
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 
 extern unsigned long table_start, table_end;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index ea91abe740da..0ae338866240 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -237,12 +237,15 @@ extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
 extern long do_sigpending(void __user *, unsigned long);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
+extern int show_unhandled_signals;
 
 struct pt_regs;
 extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
 
 extern struct kmem_cache *sighand_cachep;
 
+int unhandled_signal(struct task_struct *tsk, int sig);
+
 /*
  * In POSIX a signal is sent either to a specific thread (Linux task)
  * or to the process as a whole (Linux thread group).  How the signal
diff --git a/kernel/signal.c b/kernel/signal.c
index 39d122753bac..ef8156a6aad5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 	}
 }
 
+int unhandled_signal(struct task_struct *tsk, int sig)
+{
+	if (is_init(tsk))
+		return 1;
+	if (tsk->ptrace & PT_PTRACED)
+		return 0;
+	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
+		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
+}
+
 
 /* Notify the system that a driver wants to block all signals for this
  * process, and wants to be notified if any signals at all were to be
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 222299844ad1..ddebf3f2affe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1203,6 +1203,16 @@ static ctl_table fs_table[] = {
 };
 
 static ctl_table debug_table[] = {
+#ifdef CONFIG_X86
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "exception-trace",
+		.data		= &show_unhandled_signals,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.3-8-gc7d7


From e9ed7e722e3f4cea07cf3c4bfe98c18180a17793 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 21 Jul 2007 23:29:12 +0100
Subject: take declarations of enable_irq() et.al. to linux/interrupt.h

Now that the last inlined instances are gone, all that is left to do
is turning disable_irq_nosync on arm26 and m68k from defines to aliases
and we are all set - we can make these externs in linux/interrupt.h
uncoditional and kill remaining instances in asm/irq.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm26/kernel/armksyms.c |  2 --
 arch/arm26/kernel/irq.c      |  6 ++++++
 arch/m68k/kernel/ints.c      |  4 ++++
 include/asm-alpha/irq.h      |  4 ----
 include/asm-arm26/irq.h      |  5 -----
 include/asm-h8300/irq.h      |  3 ---
 include/asm-ia64/irq.h       |  3 ---
 include/asm-m68k/irq.h       |  3 ---
 include/asm-sh64/irq.h       |  4 ----
 include/asm-sparc/irq.h      |  4 ----
 include/asm-v850/irq.h       | 10 ----------
 include/linux/interrupt.h    |  2 +-
 12 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm26/kernel/armksyms.c b/arch/arm26/kernel/armksyms.c
index f735d7e018e4..fe1e3ceed7cb 100644
--- a/arch/arm26/kernel/armksyms.c
+++ b/arch/arm26/kernel/armksyms.c
@@ -107,8 +107,6 @@ EXPORT_SYMBOL(__bug);
 #endif
 EXPORT_SYMBOL(__bad_xchg);
 EXPORT_SYMBOL(__readwrite_bug);
-EXPORT_SYMBOL(enable_irq);
-EXPORT_SYMBOL(disable_irq);
 EXPORT_SYMBOL(set_irq_type);
 EXPORT_SYMBOL(pm_idle);
 EXPORT_SYMBOL(pm_power_off);
diff --git a/arch/arm26/kernel/irq.c b/arch/arm26/kernel/irq.c
index d53382c83bf9..2ffe695b062e 100644
--- a/arch/arm26/kernel/irq.c
+++ b/arch/arm26/kernel/irq.c
@@ -95,6 +95,11 @@ void disable_irq(unsigned int irq)
 		desc->enabled = 0;
 	spin_unlock_irqrestore(&irq_controller_lock, flags);
 }
+EXPORT_SYMBOL(disable_irq);
+
+void disable_irq_nosync(unsigned int irq) __attribute__((alias("disable_irq")));
+
+EXPORT_SYMBOL(disable_irq_nosync);
 
 /**
  *	enable_irq - enable interrupt handling on an irq
@@ -131,6 +136,7 @@ void enable_irq(unsigned int irq)
 	}
 	spin_unlock_irqrestore(&irq_controller_lock, flags);
 }
+EXPORT_SYMBOL(enable_irq);
 
 int show_interrupts(struct seq_file *p, void *v)
 {
diff --git a/arch/m68k/kernel/ints.c b/arch/m68k/kernel/ints.c
index 60d4d75f5798..2b412454cb41 100644
--- a/arch/m68k/kernel/ints.c
+++ b/arch/m68k/kernel/ints.c
@@ -326,6 +326,10 @@ void disable_irq(unsigned int irq)
 
 EXPORT_SYMBOL(disable_irq);
 
+void disable_irq_nosync(unsigned int irq) __attribute__((alias("disable_irq")));
+
+EXPORT_SYMBOL(disable_irq_nosync);
+
 int m68k_irq_startup(unsigned int irq)
 {
 	if (irq <= IRQ_AUTO_7)
diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h
index 917b9fe372cf..06377400dc09 100644
--- a/include/asm-alpha/irq.h
+++ b/include/asm-alpha/irq.h
@@ -85,10 +85,6 @@ static __inline__ int irq_canonicalize(int irq)
 	return ((irq == 2) ? 9 : irq);
 }
 
-extern void disable_irq(unsigned int);
-extern void disable_irq_nosync(unsigned int);
-extern void enable_irq(unsigned int);
-
 struct pt_regs;
 extern void (*perf_irq)(unsigned long, struct pt_regs *);
 
diff --git a/include/asm-arm26/irq.h b/include/asm-arm26/irq.h
index 9aaac87efba9..52971b49ed3b 100644
--- a/include/asm-arm26/irq.h
+++ b/include/asm-arm26/irq.h
@@ -24,11 +24,6 @@
 
 struct irqaction;
 
-#define disable_irq_nosync(i) disable_irq(i)
-
-extern void disable_irq(unsigned int);
-extern void enable_irq(unsigned int);
-
 #define __IRQT_FALEDGE	(1 << 0)
 #define __IRQT_RISEDGE	(1 << 1)
 #define __IRQT_LOWLVL	(1 << 2)
diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h
index 41be646c3514..56eec28cc2c4 100644
--- a/include/asm-h8300/irq.h
+++ b/include/asm-h8300/irq.h
@@ -59,7 +59,4 @@ static __inline__ int irq_canonicalize(int irq)
 	return irq;
 }
 
-extern void enable_irq(unsigned int);
-extern void disable_irq(unsigned int);
-
 #endif /* _H8300_IRQ_H_ */
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index 35b360b82e43..a66d26827cbb 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -33,9 +33,6 @@ irq_canonicalize (int irq)
 	return ((irq == 2) ? 9 : irq);
 }
 
-extern void disable_irq (unsigned int);
-extern void disable_irq_nosync (unsigned int);
-extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 bool is_affinity_mask_valid(cpumask_t cpumask);
 
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index 4901cb105e2f..eb29a5260591 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -59,9 +59,6 @@
 #define IRQ_USER	8
 
 extern unsigned int irq_canonicalize(unsigned int irq);
-extern void enable_irq(unsigned int);
-extern void disable_irq(unsigned int);
-#define disable_irq_nosync	disable_irq
 
 struct pt_regs;
 
diff --git a/include/asm-sh64/irq.h b/include/asm-sh64/irq.h
index 1ca49e29288a..5c9e6a873aeb 100644
--- a/include/asm-sh64/irq.h
+++ b/include/asm-sh64/irq.h
@@ -114,10 +114,6 @@
 #define	IRL0_PRIORITY	13
 #define TOP_PRIORITY	15
 
-extern void disable_irq(unsigned int);
-extern void disable_irq_nosync(unsigned int);
-extern void enable_irq(unsigned int);
-
 extern int intc_evt_to_irq[(0xE20/0x20)+1];
 int intc_irq_describe(char* p, int irq);
 
diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h
index afb88a5973f0..61fb99643afd 100644
--- a/include/asm-sparc/irq.h
+++ b/include/asm-sparc/irq.h
@@ -13,10 +13,6 @@
 
 #define irq_canonicalize(irq)	(irq)
 
-extern void disable_irq_nosync(unsigned int irq);
-extern void disable_irq(unsigned int irq);
-extern void enable_irq(unsigned int irq);
-
 extern int request_fast_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, __const__ char *devname);
 
 #endif
diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h
index 88687c181f01..7d0d4cd1ce54 100644
--- a/include/asm-v850/irq.h
+++ b/include/asm-v850/irq.h
@@ -50,16 +50,6 @@ init_irq_handlers (int base_irq, int num, int interval,
    interrupt.  */
 extern unsigned int handle_irq (int irq, struct pt_regs *regs);
 
-
-/* Enable interrupt handling on an irq.  */
-extern void enable_irq(unsigned int irq);
-
-/* Disable an irq and wait for completion.  */
-extern void disable_irq (unsigned int irq);
-
-/* Disable an irq without waiting. */
-extern void disable_irq_nosync (unsigned int irq);
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __V850_IRQ_H__ */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5323f6275854..0a3c2ebf2008 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -120,11 +120,11 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
 # define local_irq_enable_in_hardirq()	local_irq_enable()
 #endif
 
-#ifdef CONFIG_GENERIC_HARDIRQS
 extern void disable_irq_nosync(unsigned int irq);
 extern void disable_irq(unsigned int irq);
 extern void enable_irq(unsigned int irq);
 
+#ifdef CONFIG_GENERIC_HARDIRQS
 /*
  * Special lockdep variants of irq disabling/enabling.
  * These should be used for locking constructs that
-- 
cgit v1.3-8-gc7d7